From b528b1091fcae788cd686c44902f112d73a3010c Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 27 Sep 2023 11:14:10 -0500 Subject: [PATCH 01/83] Use cmake_minimum_required with min...max --- CMakeLists.txt | 1 + .../pybind11/external_usm_allocation/CMakeLists.txt | 12 ++++++------ examples/pybind11/onemkl_gemv/CMakeLists.txt | 11 ++++++----- .../pybind11/use_dpctl_sycl_kernel/CMakeLists.txt | 8 ++++---- .../pybind11/use_dpctl_sycl_queue/CMakeLists.txt | 12 ++++++------ .../cmake/modules/GetProjectVersion.cmake | 2 +- 6 files changed, 24 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b91813feda..f506f1e2fe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,7 @@ cmake_minimum_required(VERSION 3.21...3.27 FATAL_ERROR) project(dpctl + VERSION 0.15 LANGUAGES CXX DESCRIPTION "Python interface for XPU programming" ) diff --git a/examples/pybind11/external_usm_allocation/CMakeLists.txt b/examples/pybind11/external_usm_allocation/CMakeLists.txt index ce231fad4a..db8c6c9aa6 100644 --- a/examples/pybind11/external_usm_allocation/CMakeLists.txt +++ b/examples/pybind11/external_usm_allocation/CMakeLists.txt @@ -1,6 +1,7 @@ -cmake_minimum_required(VERSION 3.21) +cmake_minimum_required(VERSION 3.21...3.27 FATAL_ERROR) -project(external_usm_allocation LANGUAGES CXX) +project(external_usm_allocation VERSION 0.1 LANGUAGES CXX + DESCRIPTION "Example of passing external C++ USM allocation to Python") set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake") set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH}) @@ -13,14 +14,13 @@ set(CMAKE_CXX_STANDARD_REQUIRED True) include(FetchContent) FetchContent_Declare( pybind11 - URL https://github.com/pybind/pybind11/archive/refs/tags/v2.10.2.tar.gz - URL_HASH SHA256=93bd1e625e43e03028a3ea7389bba5d3f9f2596abc074b068e70f4ef9b1314ae + URL https://github.com/pybind/pybind11/archive/refs/tags/v2.11.1.tar.gz + URL_HASH SHA256=d475978da0cdc2d43b73f30910786759d593a9d8ee05b1b6846d1eb16c6d2e0c ) FetchContent_MakeAvailable(pybind11) -find_package(PythonExtensions REQUIRED) +find_package(Python REQUIRED COMPONENTS Development.Module NumPy) find_package(Dpctl REQUIRED) -find_package(NumPy REQUIRED) set(py_module_name _external_usm_alloc) pybind11_add_module(${py_module_name} diff --git a/examples/pybind11/onemkl_gemv/CMakeLists.txt b/examples/pybind11/onemkl_gemv/CMakeLists.txt index 25589e4202..c2ac5fc516 100644 --- a/examples/pybind11/onemkl_gemv/CMakeLists.txt +++ b/examples/pybind11/onemkl_gemv/CMakeLists.txt @@ -1,6 +1,7 @@ -cmake_minimum_required(VERSION 3.22 FATAL_ERROR) +cmake_minimum_required(VERSION 3.22...3.27 FATAL_ERROR) -project(example_use_mkl_gemm LANGUAGES CXX) +project(example_use_mkl_gemm VERSION 0.1 LANGUAGES CXX + DESCRIPTION "Example of using Python wrapper to oneMKL function") set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake") set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH}) find_package(IntelDPCPP REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH) @@ -17,12 +18,12 @@ include(GNUInstallDirs) include(FetchContent) FetchContent_Declare( pybind11 - URL https://github.com/pybind/pybind11/archive/refs/tags/v2.10.2.tar.gz - URL_HASH SHA256=93bd1e625e43e03028a3ea7389bba5d3f9f2596abc074b068e70f4ef9b1314ae + URL https://github.com/pybind/pybind11/archive/refs/tags/v2.11.1.tar.gz + URL_HASH SHA256=d475978da0cdc2d43b73f30910786759d593a9d8ee05b1b6846d1eb16c6d2e0c ) FetchContent_MakeAvailable(pybind11) -find_package(PythonExtensions REQUIRED) 
+find_package(Python REQUIRED COMPONENTS Development.Module NumPy) find_package(Dpctl REQUIRED) find_library(mkl_core NAMES mkl_core PATHS ${MKL_LIBRARY_DIR} REQUIRED) diff --git a/examples/pybind11/use_dpctl_sycl_kernel/CMakeLists.txt b/examples/pybind11/use_dpctl_sycl_kernel/CMakeLists.txt index f246d29924..32770aa750 100644 --- a/examples/pybind11/use_dpctl_sycl_kernel/CMakeLists.txt +++ b/examples/pybind11/use_dpctl_sycl_kernel/CMakeLists.txt @@ -1,6 +1,7 @@ -cmake_minimum_required(VERSION 3.21) +cmake_minimum_required(VERSION 3.21...3.27 FATAL_ERROR) -project(use_queue_device LANGUAGES CXX) +project(use_queue_device VERSION 0.1 LANGUAGES CXX + DESCRIPTION "Example of using dpctl.program.SyclKernel <-> sycl::kernel type casting") set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake") set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH}) @@ -19,9 +20,8 @@ FetchContent_Declare( ) FetchContent_MakeAvailable(pybind11) -find_package(PythonExtensions REQUIRED) +find_package(Python REQUIRED COMPONENTS Development.Module NumPy) find_package(Dpctl REQUIRED) -find_package(NumPy REQUIRED) set(py_module_name _use_kernel) pybind11_add_module(${py_module_name} diff --git a/examples/pybind11/use_dpctl_sycl_queue/CMakeLists.txt b/examples/pybind11/use_dpctl_sycl_queue/CMakeLists.txt index f7b843d7f5..4ee47e71a9 100644 --- a/examples/pybind11/use_dpctl_sycl_queue/CMakeLists.txt +++ b/examples/pybind11/use_dpctl_sycl_queue/CMakeLists.txt @@ -1,6 +1,7 @@ -cmake_minimum_required(VERSION 3.21) +cmake_minimum_required(VERSION 3.21...3.27 FATAL_ERROR) -project(use_queue_device LANGUAGES CXX) +project(use_queue_device VERSION 0.1 LANGUAGES CXX + DESCRIPTION "Example of using dpctl.SyclQueue <-> sycl::queue type caster") set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake") set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH}) @@ -13,14 +14,13 @@ set(CMAKE_CXX_STANDARD_REQUIRED True) include(FetchContent) FetchContent_Declare( pybind11 - URL https://github.com/pybind/pybind11/archive/refs/tags/v2.10.2.tar.gz - URL_HASH SHA256=93bd1e625e43e03028a3ea7389bba5d3f9f2596abc074b068e70f4ef9b1314ae + URL https://github.com/pybind/pybind11/archive/refs/tags/v2.11.1.tar.gz + URL_HASH SHA256=d475978da0cdc2d43b73f30910786759d593a9d8ee05b1b6846d1eb16c6d2e0c ) FetchContent_MakeAvailable(pybind11) -find_package(PythonExtensions REQUIRED) +find_package(Python REQUIRED COMPONENTS Development.Module NumPy) find_package(Dpctl REQUIRED) -find_package(NumPy REQUIRED) set(py_module_name _use_queue_device) pybind11_add_module(${py_module_name} diff --git a/libsyclinterface/cmake/modules/GetProjectVersion.cmake b/libsyclinterface/cmake/modules/GetProjectVersion.cmake index c0f4ec4a6f..a863a4ee17 100644 --- a/libsyclinterface/cmake/modules/GetProjectVersion.cmake +++ b/libsyclinterface/cmake/modules/GetProjectVersion.cmake @@ -29,7 +29,7 @@ # VERSION_MINOR # VERSION # SEMVER -cmake_minimum_required( VERSION 3.14.0 ) +cmake_minimum_required(VERSION 3.14...3.27 FATAL_ERROR ) function(get_version) # Use git describe to get latest tag name From a9aae289b74dc9f049155a9ac53a74a0a02a394a Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 27 Sep 2023 13:15:32 -0500 Subject: [PATCH 02/83] Fix upload_linux/upload_windows steps --- .github/workflows/conda-package.yml | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml index f1435f0ccc..09806bcce9 100644 --- 
a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -320,11 +320,16 @@ jobs:
       matrix:
         python: ['3.9', '3.10', '3.11']
     steps:
-      - name: Download artifact
+      - name: Download conda artifact
        uses: actions/download-artifact@v3
        with:
          name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }}

+      - name: Download wheel artifact
+        uses: actions/download-artifact@v3
+        with:
+          name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Wheels Python ${{ matrix.python }}
+
       - name: Install anaconda-client
         run: conda install anaconda-client
       - name: Add conda to system path
@@ -353,10 +358,17 @@
         uses: actions/download-artifact@v3
         with:
           name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Python ${{ matrix.python }}
+
+      - name: Download wheel artifact
+        uses: actions/download-artifact@v3
+        with:
+          name: ${{ env.PACKAGE_NAME }} ${{ runner.os }} Wheels Python ${{ matrix.python }}
+
       - uses: conda-incubator/setup-miniconda@v2
         with:
           auto-activate-base: true
           activate-environment: ""
+
       - name: Install anaconda-client
         run: conda install anaconda-client

From 71e891ce2701307d9f0beb71c4fe46cd215e087d Mon Sep 17 00:00:00 2001
From: ndgrigorian <46709016+ndgrigorian@users.noreply.github.com>
Date: Tue, 3 Oct 2023 09:23:15 -0700
Subject: [PATCH 03/83] Check in of generic reduction templates and some
 reductions (#1399)

* Implements necessary sycl utilities for custom reductions

* Implements dpctl.tensor.max and dpctl.tensor.min

* Adds tests for min and max

* Reductions now set max_wg to the minimum of the max work group size and
  2048
  - This prevents running out of resources when using local memory on CPU

* max and min NaN propagation fixed for CPU devices
  - drops use of fetch_max/fetch_min for floats, which do not handle NaNs
    correctly

* Tweak to test_reduction_kernels

* Implements dpctl.tensor.argmax and argmin

* Tests for argmin and argmax

Also fixes argmin and argmax for scalar inputs

* Argmin and argmax now handle identities correctly

Adds a test for this behavior

Fixed a typo in argmin and argmax causing the shared local memory variant
to be used for more types than expected

* Replaced `std::min` with `idx_reduction_op_`

* reductions now well-behaved for size-zero arrays
  - comparison and search reductions will throw an error in this case
  - slips in a change to align the sum signature with the array API spec

* removed unnecessary copies in reduction templates

* Refactors sum to use generic reduction templates

* Sum now uses a generic Python API

* Docstrings added for argmax, argmin, max, and min

* Small reduction clean-ups

Removed unnecessary copies in custom_reduce_over_group

Sequential reduction now casts before calling the operator (makes the
behavior explicit rather than implicit)

* Added test for argmin with keepdims=True

* Added a test for raised errors in reductions

Also removed unused `_usm_types` in `test_tensor_sum`

* Removed `void` overloads from reduction utilities

These were unused by dpctl

* Added missing include, Identity to use has_known_identity

Implementation of the Identity trait should call sycl::known_identity if
the trait sycl::has_known_identity is a true_type. Added IsMultiplies,
and an identity value for it, since sycl::known_identity for multiplies
is only defined for real-valued types.

* Adding functor factories for product over axis

* Added Python API for _prod_over_axis

* Common reduction template takes functions to test if atomics are
  applicable

Passing these function pointers around makes it possible to turn atomics
off altogether if desired.
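For illustration only (an editorial sketch, not code from this commit; the
predicate names and the atomic64-aspect check below are assumptions), such
"are atomics applicable" function pointers could have roughly this shape:

    #include <sycl/sycl.hpp>

    // predicate the common reduction template consults before selecting
    // an atomic_ref-based kernel
    typedef bool (*atomic_support_fn_ptr)(const sycl::device &);

    // always-off variant: turns atomic kernels off altogether
    bool fixed_decision_no(const sycl::device &) { return false; }

    // capability-based variant: require 64-bit atomics on the device
    bool check_atomic64_support(const sycl::device &d)
    {
        return d.has(sycl::aspect::atomic64);
    }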
Use a custom trait to check if reduce_over_groups can be used.
This allows working around the bug, or switching to custom code for the
reduction over a group if desired. Such a custom trait type works around
an issue with an incorrect result returned from sycl::reduce_over_group
for the sycl::multiplies operator for 64-bit integral types.

* Defined dpctl.tensor.prod

Also tweaked docstring for sum.

* Added tests for dpt.prod, removed uses of numpy

* Corrected prod docstring

Small tweaks to sum, min, and max docstrings

---------

Co-authored-by: Oleksandr Pavlyk
---
 dpctl/tensor/CMakeLists.txt                   |    3 +-
 dpctl/tensor/__init__.py                      |    7 +-
 dpctl/tensor/_reduction.py                    |  414 +++-
 .../libtensor/include/kernels/reductions.hpp  | 2193 +++++++++++++++--
 .../libtensor/include/utils/sycl_utils.hpp    |  242 ++
 .../libtensor/source/reduction_over_axis.cpp  |  514 ++++
 ...reductions.cpp => reduction_over_axis.hpp} |  463 ++--
 .../libtensor/source/sum_reductions.hpp       |   40 -
 dpctl/tensor/libtensor/source/tensor_py.cpp   |    2 +-
 dpctl/tests/test_tensor_sum.py                |   79 +-
 dpctl/tests/test_usm_ndarray_reductions.py    |  236 ++
 11 files changed, 3759 insertions(+), 434 deletions(-)
 create mode 100644 dpctl/tensor/libtensor/source/reduction_over_axis.cpp
 rename dpctl/tensor/libtensor/source/{sum_reductions.cpp => reduction_over_axis.hpp} (57%)
 delete mode 100644 dpctl/tensor/libtensor/source/sum_reductions.hpp
 create mode 100644 dpctl/tests/test_usm_ndarray_reductions.py

diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt
index 456eebdbaa..9a2493421e 100644
--- a/dpctl/tensor/CMakeLists.txt
+++ b/dpctl/tensor/CMakeLists.txt
@@ -49,8 +49,8 @@ pybind11_add_module(${python_module_name} MODULE
   ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_reductions.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sum_reductions.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reduction_over_axis.cpp
 )
 set(_clang_prefix "")
 if (WIN32)
@@ -60,6 +60,7 @@ set_source_files_properties(
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reduction_over_axis.cpp
     PROPERTIES COMPILE_OPTIONS "${_clang_prefix}-fno-fast-math")
 if (UNIX)
     set_source_files_properties(
diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py
index f0930004ec..3473d5cde5 100644
--- a/dpctl/tensor/__init__.py
+++ b/dpctl/tensor/__init__.py
@@ -160,7 +160,7 @@
     tanh,
     trunc,
 )
-from ._reduction import sum
+from ._reduction import argmax, argmin, max, min, prod, sum
 from ._testing import allclose

 __all__ = [
@@ -309,4 +309,9 @@
     "allclose",
     "repeat",
     "tile",
+    "max",
+    "min",
+    "argmax",
+    "argmin",
+    "prod",
 ]
diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py
index d9bd6b5b2b..aac1c84677 100644
--- a/dpctl/tensor/_reduction.py
+++ b/dpctl/tensor/_reduction.py
@@ -52,18 +52,107 @@ def _default_reduction_dtype(inp_dt, q):
     return res_dt


-def sum(arr, axis=None, dtype=None, keepdims=False):
+def _reduction_over_axis(
+    x,
+    axis,
+    dtype,
+    keepdims,
+    _reduction_fn,
+    _dtype_supported,
+    _default_reduction_type_fn,
+    _identity=None,
+):
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+    nd = x.ndim
+    if axis is None:
+        axis 
= tuple(range(nd)) + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + red_nd = len(axis) + perm = [i for i in range(nd) if i not in axis] + list(axis) + arr2 = dpt.permute_dims(x, perm) + res_shape = arr2.shape[: nd - red_nd] + q = x.sycl_queue + inp_dt = x.dtype + if dtype is None: + res_dt = _default_reduction_type_fn(inp_dt, q) + else: + res_dt = dpt.dtype(dtype) + res_dt = _to_device_supported_dtype(res_dt, q.sycl_device) + + res_usm_type = x.usm_type + if x.size == 0: + if _identity is None: + raise ValueError("reduction does not support zero-size arrays") + else: + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res_shape = tuple(res_shape[i] for i in inv_perm) + return dpt.full( + res_shape, + _identity, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=q, + ) + if red_nd == 0: + return dpt.astype(x, res_dt, copy=False) + + host_tasks_list = [] + if _dtype_supported(inp_dt, res_dt, res_usm_type, q): + res = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e, _ = _reduction_fn( + src=arr2, trailing_dims_to_reduce=red_nd, dst=res, sycl_queue=q + ) + host_tasks_list.append(ht_e) + else: + if dtype is None: + raise RuntimeError( + "Automatically determined reduction data type does not " + "have direct implementation" + ) + tmp_dt = _default_reduction_dtype(inp_dt, q) + tmp = dpt.empty( + res_shape, dtype=tmp_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_tmp, r_e = _reduction_fn( + src=arr2, trailing_dims_to_reduce=red_nd, dst=tmp, sycl_queue=q + ) + host_tasks_list.append(ht_e_tmp) + res = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=tmp, dst=res, sycl_queue=q, depends=[r_e] + ) + host_tasks_list.append(ht_e) + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm) + dpctl.SyclEvent.wait_for(host_tasks_list) + + return res + + +def sum(x, axis=None, dtype=None, keepdims=False): """sum(x, axis=None, dtype=None, keepdims=False) - Calculates the sum of the input array `x`. + Calculates the sum of elements in the input array `x`. Args: x (usm_ndarray): input array. - axis (Optional[int, Tuple[int,...]]): + axis (Optional[int, Tuple[int, ...]]): axis or axes along which sums must be computed. If a tuple of unique integers, sums are computed over multiple axes. - If `None`, the sum if computed over the entire array. + If `None`, the sum is computed over the entire array. Default: `None`. dtype (Optional[dtype]): data type of the returned array. If `None`, the default data @@ -101,9 +190,84 @@ def sum(arr, axis=None, dtype=None, keepdims=False): array has the data type as described in the `dtype` parameter description above. """ - if not isinstance(arr, dpt.usm_ndarray): - raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(arr)}") - nd = arr.ndim + return _reduction_over_axis( + x, + axis, + dtype, + keepdims, + ti._sum_over_axis, + ti._sum_over_axis_dtype_supported, + _default_reduction_dtype, + _identity=0, + ) + + +def prod(x, axis=None, dtype=None, keepdims=False): + """prod(x, axis=None, dtype=None, keepdims=False) + + Calculates the product of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which products must be computed. 
If a tuple
+            of unique integers, products are computed over multiple axes.
+            If `None`, the product is computed over the entire array.
+            Default: `None`.
+        dtype (Optional[dtype]):
+            data type of the returned array. If `None`, the default data
+            type is inferred from the "kind" of the input array data type.
+                * If `x` has a real-valued floating-point data type,
+                  the returned array will have the default real-valued
+                  floating-point data type for the device where input
+                  array `x` is allocated.
+                * If `x` has signed integral data type, the returned array
+                  will have the default signed integral type for the device
+                  where input array `x` is allocated.
+                * If `x` has unsigned integral data type, the returned array
+                  will have the default unsigned integral type for the device
+                  where input array `x` is allocated.
+                * If `x` has a complex-valued floating-point data type,
+                  the returned array will have the default complex-valued
+                  floating-point data type for the device where input
+                  array `x` is allocated.
+                * If `x` has a boolean data type, the returned array will
+                  have the default signed integral type for the device
+                  where input array `x` is allocated.
+            If the data type (either specified or resolved) differs from the
+            data type of `x`, the input array elements are cast to the
+            specified data type before computing the product. Default: `None`.
+        keepdims (Optional[bool]):
+            if `True`, the reduced axes (dimensions) are included in the result
+            as singleton dimensions, so that the returned array remains
+            compatible with the input arrays according to Array Broadcasting
+            rules. Otherwise, if `False`, the reduced axes are not included in
+            the returned array. Default: `False`.
+    Returns:
+        usm_ndarray:
+            an array containing the products. If the product was computed over
+            the entire array, a zero-dimensional array is returned. The returned
+            array has the data type as described in the `dtype` parameter
+            description above.
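+
+    Example:
+        A minimal usage sketch (an editorial illustration, not part of the
+        original change); expected values are shown in the comments:
+
+            import dpctl.tensor as dpt
+
+            x = dpt.reshape(dpt.arange(6, dtype="i4"), (2, 3))
+            dpt.prod(x, axis=1)  # per-row products: [0, 60]
+            dpt.prod(x)          # product of all elements: 0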
+    """
+    return _reduction_over_axis(
+        x,
+        axis,
+        dtype,
+        keepdims,
+        ti._prod_over_axis,
+        ti._prod_over_axis_dtype_supported,
+        _default_reduction_dtype,
+        _identity=1,
+    )
+
+
+def _comparison_over_axis(x, axis, keepdims, _reduction_fn):
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+
+    nd = x.ndim
     if axis is None:
         axis = tuple(range(nd))
     if not isinstance(axis, (tuple, list)):
@@ -111,63 +275,201 @@ def sum(arr, axis=None, dtype=None, keepdims=False):
     axis = normalize_axis_tuple(axis, nd, "axis")
     red_nd = len(axis)
     perm = [i for i in range(nd) if i not in axis] + list(axis)
-    arr2 = dpt.permute_dims(arr, perm)
-    res_shape = arr2.shape[: nd - red_nd]
-    q = arr.sycl_queue
-    inp_dt = arr.dtype
-    if dtype is None:
-        res_dt = _default_reduction_dtype(inp_dt, q)
-    else:
-        res_dt = dpt.dtype(dtype)
-        res_dt = _to_device_supported_dtype(res_dt, q.sycl_device)
-
-    res_usm_type = arr.usm_type
-    if arr.size == 0:
-        if keepdims:
-            res_shape = res_shape + (1,) * red_nd
-            inv_perm = sorted(range(nd), key=lambda d: perm[d])
-            res_shape = tuple(res_shape[i] for i in inv_perm)
-        return dpt.zeros(
-            res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
-        )
+    x_tmp = dpt.permute_dims(x, perm)
+    res_shape = x_tmp.shape[: nd - red_nd]
+    exec_q = x.sycl_queue
+    res_dt = x.dtype
+    res_usm_type = x.usm_type
+    if x.size == 0:
+        raise ValueError("reduction does not support zero-size arrays")
     if red_nd == 0:
-        return dpt.astype(arr, res_dt, copy=False)
+        return x

-    host_tasks_list = []
-    if ti._sum_over_axis_dtype_supported(inp_dt, res_dt, res_usm_type, q):
-        res = dpt.empty(
-            res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
-        )
-        ht_e, _ = ti._sum_over_axis(
-            src=arr2, trailing_dims_to_reduce=red_nd, dst=res, sycl_queue=q
-        )
-        host_tasks_list.append(ht_e)
+    res = dpt.empty(
+        res_shape,
+        dtype=res_dt,
+        usm_type=res_usm_type,
+        sycl_queue=exec_q,
+    )
+    hev, _ = _reduction_fn(
+        src=x_tmp,
+        trailing_dims_to_reduce=red_nd,
+        dst=res,
+        sycl_queue=exec_q,
+    )
+
+    if keepdims:
+        res_shape = res_shape + (1,) * red_nd
+        inv_perm = sorted(range(nd), key=lambda d: perm[d])
+        res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm)
+    hev.wait()
+    return res
+
+
+def max(x, axis=None, keepdims=False):
+    """max(x, axis=None, keepdims=False)
+
+    Calculates the maximum value of the input array `x`.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int, Tuple[int, ...]]):
+            axis or axes along which maxima must be computed. If a tuple
+            of unique integers, the maxima are computed over multiple axes.
+            If `None`, the max is computed over the entire array.
+            Default: `None`.
+        keepdims (Optional[bool]):
+            if `True`, the reduced axes (dimensions) are included in the result
+            as singleton dimensions, so that the returned array remains
+            compatible with the input arrays according to Array Broadcasting
+            rules. Otherwise, if `False`, the reduced axes are not included in
+            the returned array. Default: `False`.
+    Returns:
+        usm_ndarray:
+            an array containing the maxima. If the max was computed over the
+            entire array, a zero-dimensional array is returned. The returned
+            array has the same data type as `x`.
+    """
+    return _comparison_over_axis(x, axis, keepdims, ti._max_over_axis)
+
+
+def min(x, axis=None, keepdims=False):
+    """min(x, axis=None, keepdims=False)
+
+    Calculates the minimum value of the input array `x`.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int, Tuple[int, ...]]):
+            axis or axes along which minima must be computed. If a tuple
+            of unique integers, the minima are computed over multiple axes.
+            If `None`, the min is computed over the entire array.
+            Default: `None`.
+        keepdims (Optional[bool]):
+            if `True`, the reduced axes (dimensions) are included in the result
+            as singleton dimensions, so that the returned array remains
+            compatible with the input arrays according to Array Broadcasting
+            rules. Otherwise, if `False`, the reduced axes are not included in
+            the returned array. Default: `False`.
+    Returns:
+        usm_ndarray:
+            an array containing the minima. If the min was computed over the
+            entire array, a zero-dimensional array is returned. The returned
+            array has the same data type as `x`.
+    """
+    return _comparison_over_axis(x, axis, keepdims, ti._min_over_axis)
+
+
+def _search_over_axis(x, axis, keepdims, _reduction_fn):
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
+
+    nd = x.ndim
+    if axis is None:
+        axis = tuple(range(nd))
+    elif isinstance(axis, int):
+        axis = (axis,)
     else:
-        if dtype is None:
-            raise RuntimeError(
-                "Automatically determined reduction data type does not "
-                "have direct implementation"
-            )
-        tmp_dt = _default_reduction_dtype(inp_dt, q)
-        tmp = dpt.empty(
-            res_shape, dtype=tmp_dt, usm_type=res_usm_type, sycl_queue=q
-        )
-        ht_e_tmp, r_e = ti._sum_over_axis(
-            src=arr2, trailing_dims_to_reduce=red_nd, dst=tmp, sycl_queue=q
-        )
-        host_tasks_list.append(ht_e_tmp)
-        res = dpt.empty(
-            res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q
+        raise TypeError(
+            f"`axis` argument expected `int` or `None`, got {type(axis)}"
        )
-        ht_e, _ = ti._copy_usm_ndarray_into_usm_ndarray(
-            src=tmp, dst=res, sycl_queue=q, depends=[r_e]
+    axis = normalize_axis_tuple(axis, nd, "axis")
+    red_nd = len(axis)
+    perm = [i for i in range(nd) if i not in axis] + list(axis)
+    x_tmp = dpt.permute_dims(x, perm)
+    res_shape = x_tmp.shape[: nd - red_nd]
+    exec_q = x.sycl_queue
+    res_dt = ti.default_device_index_type(exec_q.sycl_device)
+    res_usm_type = x.usm_type
+    if x.size == 0:
+        raise ValueError("reduction does not support zero-size arrays")
+    if red_nd == 0:
+        return dpt.zeros(
+            res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q
        )
-        host_tasks_list.append(ht_e)
+
+    res = dpt.empty(
+        res_shape,
+        dtype=res_dt,
+        usm_type=res_usm_type,
+        sycl_queue=exec_q,
+    )
+    hev, _ = _reduction_fn(
+        src=x_tmp,
+        trailing_dims_to_reduce=red_nd,
+        dst=res,
+        sycl_queue=exec_q,
+    )

     if keepdims:
         res_shape = res_shape + (1,) * red_nd
         inv_perm = sorted(range(nd), key=lambda d: perm[d])
         res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm)
-    dpctl.SyclEvent.wait_for(host_tasks_list)
-
+    hev.wait()
     return res
+
+
+def argmax(x, axis=None, keepdims=False):
+    """argmax(x, axis=None, keepdims=False)
+
+    Returns the indices of the maximum values of the input array `x` along a
+    specified axis.
+
+    When the maximum value occurs multiple times, the indices corresponding to
+    the first occurrence are returned.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int]):
+            axis along which to search. If `None`, returns the index of the
+            maximum value of the flattened array.
+            Default: `None`.
+        keepdims (Optional[bool]):
+            if `True`, the reduced axes (dimensions) are included in the result
+            as singleton dimensions, so that the returned array remains
+            compatible with the input arrays according to Array Broadcasting
+            rules. 
Otherwise, if `False`, the reduced axes are not included in
+            the returned array. Default: `False`.
+    Returns:
+        usm_ndarray:
+            an array containing the indices of the first occurrence of the
+            maximum values. If the entire array was searched, a
+            zero-dimensional array is returned. The returned array has the
+            default array index data type for the device of `x`.
+    """
+    return _search_over_axis(x, axis, keepdims, ti._argmax_over_axis)
+
+
+def argmin(x, axis=None, keepdims=False):
+    """argmin(x, axis=None, keepdims=False)
+
+    Returns the indices of the minimum values of the input array `x` along a
+    specified axis.
+
+    When the minimum value occurs multiple times, the indices corresponding to
+    the first occurrence are returned.
+
+    Args:
+        x (usm_ndarray):
+            input array.
+        axis (Optional[int]):
+            axis along which to search. If `None`, returns the index of the
+            minimum value of the flattened array.
+            Default: `None`.
+        keepdims (Optional[bool]):
+            if `True`, the reduced axes (dimensions) are included in the result
+            as singleton dimensions, so that the returned array remains
+            compatible with the input arrays according to Array Broadcasting
+            rules. Otherwise, if `False`, the reduced axes are not included in
+            the returned array. Default: `False`.
+    Returns:
+        usm_ndarray:
+            an array containing the indices of the first occurrence of the
+            minimum values. If the entire array was searched, a
+            zero-dimensional array is returned. The returned array has the
+            default array index data type for the device of `x`.
+    """
+    return _search_over_axis(x, axis, keepdims, ti._argmin_over_axis)
diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp
index 7dfc956492..7cb97cd4f9 100644
--- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp
@@ -24,6 +24,7 @@
 #pragma once
 #include
+#include
 #include
 #include
 #include
@@ -32,6 +33,7 @@
 #include
 #include "pybind11/pybind11.h"
+#include "utils/math_utils.hpp"
 #include "utils/offset_utils.hpp"
 #include "utils/sycl_utils.hpp"
 #include "utils/type_dispatch.hpp"
@@ -39,6 +41,7 @@
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
+namespace su_ns = dpctl::tensor::sycl_utils;

 namespace dpctl
 {
@@ -47,6 +50,14 @@ namespace tensor
 namespace kernels
 {

+template struct can_use_reduce_over_group
+{
+    static constexpr bool value =
+        sycl::has_known_identity::value &&
+        !std::is_same_v && !std::is_same_v &&
+        !std::is_same_v>;
+};
+
 template (inp_[inp_offset]);
+            red_val = reduction_op_(red_val, val);
         }

         out_[out_iter_offset] = red_val;
@@ -153,7 +166,7 @@ struct ReductionOverGroupWithAtomicFunctor
         const size_t reduction_lid = it.get_local_id(0);
         const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg

-        // work-items sums over input with indices
+        // work-items operate over input with indices
         // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg
         //               + reduction_lid
         // for 0 <= m < reductions_per_wi
@@ -191,11 +204,17 @@
                           sycl::memory_scope::device,
                           sycl::access::address_space::global_space>
             res_ref(out_[out_iter_offset]);
-        if constexpr (std::is_same_v> ||
-                      std::is_same_v>)
-        {
+        if constexpr (su_ns::IsPlus::value) {
             res_ref += red_val_over_wg;
         }
+        else if constexpr (std::is_same_v>)
+        {
+            res_ref.fetch_max(red_val_over_wg);
+        }
+        else if constexpr (std::is_same_v>)
+        {
+            res_ref.fetch_min(red_val_over_wg);
+        }
         else {
             outT read_val = 
res_ref.load(); outT new_val{}; @@ -207,7 +226,103 @@ struct ReductionOverGroupWithAtomicFunctor } }; -typedef sycl::event (*sum_reduction_strided_impl_fn_ptr)( +/* === Reduction, using custom_reduce_over_group, and sycl::atomic_ref === */ + +template +struct CustomReductionOverGroupWithAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + size_t reduction_max_gid_ = 0; + size_t iter_gws_ = 1; + size_t reductions_per_wi = 16; + +public: + CustomReductionOverGroupWithAtomicFunctor( + const argT *data, + outT *res, + ReductionOp reduction_op, + const outT &identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + SlmT local_mem, + size_t reduction_size, + size_t iteration_size, + size_t reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + + const size_t reduction_lid = it.get_local_id(0); + const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + + // work-items operate over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + size_t arg_reduce_gid_max = std::min( + reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); + + for (size_t arg_reduce_gid = arg_reduce_gid0; + arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) + { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val = convert_impl(inp_[inp_offset]); + + local_red_val = reduction_op_(local_red_val, val); + } + + auto work_group = it.get_group(); + outT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, local_mem_, local_red_val, reduction_op_); + + if (work_group.leader()) { + sycl::atomic_ref + res_ref(out_[out_iter_offset]); + outT read_val = res_ref.load(); + outT new_val{}; + do { + new_val = reduction_op_(read_val, red_val_over_wg); + } while (!res_ref.compare_exchange_strong(read_val, new_val)); + } + } +}; + +typedef sycl::event (*reduction_strided_impl_fn_ptr)( sycl::queue &, size_t, size_t, @@ -223,27 +338,51 @@ typedef sycl::event (*sum_reduction_strided_impl_fn_ptr)( const std::vector &); template -class sum_reduction_over_group_with_atomics_krn; +class reduction_over_group_with_atomics_krn; + +template +class custom_reduction_over_group_with_atomics_krn; -template -class sum_reduction_over_group_with_atomics_init_krn; +template +class reduction_over_group_with_atomics_init_krn; template -class 
sum_reduction_seq_strided_krn; +class reduction_seq_strided_krn; template -class sum_reduction_seq_contig_krn; +class reduction_seq_contig_krn; template -class sum_reduction_axis0_over_group_with_atomics_contig_krn; +class reduction_axis0_over_group_with_atomics_contig_krn; + +template +class custom_reduction_axis0_over_group_with_atomics_contig_krn; template -class sum_reduction_axis1_over_group_with_atomics_contig_krn; +class reduction_axis1_over_group_with_atomics_contig_krn; + +template +class custom_reduction_axis1_over_group_with_atomics_contig_krn; using dpctl::tensor::sycl_utils::choose_workgroup_size; -template -sycl::event sum_reduction_over_group_with_atomics_strided_impl( +template +sycl::event reduction_over_group_with_atomics_strided_impl( sycl::queue &exec_q, size_t iter_nelems, // number of reductions (num. of rows in a matrix // when reducing over rows) @@ -263,8 +402,7 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( const argTy *arg_tp = reinterpret_cast(arg_cp); resTy *res_tp = reinterpret_cast(res_cp); - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; + constexpr resTy identity_val = su_ns::Identity::value; const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); @@ -285,7 +423,7 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, reduction_shape_stride}; - cgh.parallel_for>( sycl::range<1>(iter_nelems), @@ -308,8 +446,8 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, res_strides); using InitKernelName = - class sum_reduction_over_group_with_atomics_init_krn; + class reduction_over_group_with_atomics_init_krn; cgh.depends_on(depends); cgh.parallel_for( @@ -347,18 +485,38 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = class sum_reduction_over_group_with_atomics_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class reduction_over_group_with_atomics_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupWithAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(arg_tp, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_reduction_over_group_with_atomics_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } }); return comp_ev; 
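(Editorial aside, not part of the patch: the kernels above are launched on a
flat 1D nd_range of size iter_nelems * reduction_groups * wg; the standalone
sketch below, with made-up sizes, shows how a flat group id is decomposed back
into an iteration index and a reduction batch, mirroring the functor code.)

    // Hypothetical, self-contained illustration of the decomposition used
    // by the reduction functors; names mirror the functor fields.
    #include <cstddef>
    #include <iostream>

    int main()
    {
        const std::size_t wg = 4;                // work-group size
        const std::size_t reductions_per_wi = 2; // inputs folded per work-item
        const std::size_t iter_nelems = 3;       // independent reductions
        const std::size_t reduction_nelems = 16; // inputs per reduction

        // groups needed along the reduction dimension (ceiling division)
        const std::size_t reduction_groups =
            (reduction_nelems + wg * reductions_per_wi - 1) /
            (wg * reductions_per_wi); // 2

        // each flat group id splits into (iteration id, reduction batch id)
        for (std::size_t group_id = 0;
             group_id < iter_nelems * reduction_groups; ++group_id)
        {
            const std::size_t iter_gid = group_id % iter_nelems;
            const std::size_t reduction_batch_id = group_id / iter_nelems;
            std::cout << group_id << " -> (" << iter_gid << ", "
                      << reduction_batch_id << ")\n";
        }
        return 0;
    }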
@@ -367,7 +525,7 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( // Contig -typedef sycl::event (*sum_reduction_contig_impl_fn_ptr)( +typedef sycl::event (*reduction_contig_impl_fn_ptr)( sycl::queue &, size_t, size_t, @@ -379,8 +537,8 @@ typedef sycl::event (*sum_reduction_contig_impl_fn_ptr)( const std::vector &); /* @brief Reduce rows in a matrix */ -template -sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( +template +sycl::event reduction_axis1_over_group_with_atomics_contig_impl( sycl::queue &exec_q, size_t iter_nelems, // number of reductions (num. of rows in a matrix // when reducing over rows) @@ -397,8 +555,7 @@ sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( iter_arg_offset + reduction_arg_offset; resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; + constexpr resTy identity_val = su_ns::Identity::value; const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); @@ -422,7 +579,7 @@ sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{}; - cgh.parallel_for>( sycl::range<1>(iter_nelems), @@ -470,28 +627,47 @@ sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = - class sum_reduction_axis1_over_group_with_atomics_contig_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = + class reduction_axis1_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupWithAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(arg_tp, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class + custom_reduction_axis1_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } }); - return comp_ev; } } /* @brief Reduce rows in a matrix */ -template -sycl::event sum_reduction_axis0_over_group_with_atomics_contig_impl( +template +sycl::event reduction_axis0_over_group_with_atomics_contig_impl( sycl::queue &exec_q, size_t iter_nelems, // number of reductions (num. 
of cols in a matrix // when reducing over cols) @@ -508,8 +684,8 @@ sycl::event sum_reduction_axis0_over_group_with_atomics_contig_impl( iter_arg_offset + reduction_arg_offset; resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; + constexpr resTy identity_val = su_ns::Identity::value; + ; const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); @@ -551,21 +727,40 @@ sycl::event sum_reduction_axis0_over_group_with_atomics_contig_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = - class sum_reduction_axis0_over_group_with_atomics_contig_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = + class reduction_axis0_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupWithAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(arg_tp, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class + custom_reduction_axis0_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } }); - return comp_ev; } } @@ -618,7 +813,7 @@ struct ReductionOverGroupNoAtomicFunctor const size_t reduction_batch_id = it.get_group(0) / iter_gws_; const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; - // work-items sums over input with indices + // work-items operates over input with indices // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg // + reduction_lid // for 0 <= m < reductions_per_wi @@ -658,11 +853,110 @@ struct ReductionOverGroupNoAtomicFunctor } }; -template -class sum_reduction_over_group_temps_krn; +/* = Reduction, using custom_reduce_over_group and not using atomic_ref*/ + +template +struct CustomReductionOverGroupNoAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + size_t reduction_max_gid_ = 0; + size_t iter_gws_ = 1; + size_t reductions_per_wi = 16; + +public: + CustomReductionOverGroupNoAtomicFunctor( + const argT *data, + outT *res, + ReductionOp reduction_op, + const outT &identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + SlmT local_mem, + size_t reduction_size, + size_t iteration_size, + size_t 
reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const size_t reduction_lid = it.get_local_id(0); + const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; + + // work-items operates over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (size_t m = 0; m < reductions_per_wi; ++m) { + size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val = convert_impl(inp_[inp_offset]); + + local_red_val = reduction_op_(local_red_val, val); + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + outT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, local_mem_, local_red_val, reduction_op_); + + if (work_group.leader()) { + // each group writes to a different memory location + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + red_val_over_wg; + } + } +}; -template -sycl::event sum_reduction_over_group_temps_strided_impl( +template +class reduction_over_group_temps_krn; + +template +class custom_reduction_over_group_temps_krn; + +template +sycl::event reduction_over_group_temps_strided_impl( sycl::queue &exec_q, size_t iter_nelems, // number of reductions (num. 
of rows in a matrix // when reducing over rows) @@ -682,19 +976,21 @@ sycl::event sum_reduction_over_group_temps_strided_impl( const argTy *arg_tp = reinterpret_cast(arg_cp); resTy *res_tp = reinterpret_cast(res_cp); - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; + constexpr resTy identity_val = su_ns::Identity::value; const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); constexpr size_t preferrered_reductions_per_wi = 4; - size_t max_wg = d.get_info(); + // max_max_wg prevents running out of resources on CPU + constexpr size_t max_max_wg = 2048; + size_t max_wg = std::min( + max_max_wg, d.get_info()); size_t reductions_per_wi(preferrered_reductions_per_wi); if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction only requires 1 work-group, can output directly to res + // reduction only requries 1 work-group, can output directly to res sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -722,19 +1018,38 @@ sycl::event sum_reduction_over_group_temps_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = class sum_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); - }); + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(arg_tp, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class custom_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + }); return comp_ev; } else { @@ -773,9 +1088,10 @@ sycl::event sum_reduction_over_group_temps_strided_impl( using ReductionIndexerT = dpctl::tensor::offset_utils::StridedIndexer; - // Only 2*iter_nd entries describing shape and strides of iterated - // dimensions of input array from iter_shape_and_strides are going - // to be accessed by inp_indexer + // Only 2*iter_nd entries describing shape and strides of + // iterated dimensions of input array from + // iter_shape_and_strides are going to be accessed by + // inp_indexer InputIndexerT inp_indexer(iter_nd, iter_arg_offset, iter_shape_and_strides); ResIndexerT noop_tmp_indexer{}; @@ -789,17 +1105,37 @@ sycl::event sum_reduction_over_group_temps_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = 
sycl::range<1>{wg}; - using KernelName = class sum_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor( - arg_tp, partially_reduced_tmp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>( + arg_tp, partially_reduced_tmp, ReductionOpT(), + identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class custom_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, partially_reduced_tmp, ReductionOpT(), + identity_val, in_out_iter_indexer, reduction_indexer, + local_memory, reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); + } }); size_t remaining_reduction_nelems = reduction_groups; @@ -817,34 +1153,34 @@ sycl::event sum_reduction_over_group_temps_strided_impl( assert(reduction_groups_ > 1); // keep reducing - sycl::event partial_reduction_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(dependent_ev); - - using InputIndexerT = - dpctl::tensor::offset_utils::Strided1DIndexer; - using ResIndexerT = - dpctl::tensor::offset_utils::NoOpIndexer; - using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< - InputIndexerT, ResIndexerT>; - using ReductionIndexerT = - dpctl::tensor::offset_utils::NoOpIndexer; - - InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_groups_)}; - ResIndexerT res_iter_indexer{}; - - InputOutputIterIndexerT in_out_iter_indexer{ - inp_indexer, res_iter_indexer}; - ReductionIndexerT reduction_indexer{}; - - auto globalRange = - sycl::range<1>{iter_nelems * reduction_groups_ * wg}; - auto localRange = sycl::range<1>{wg}; - - using KernelName = class sum_reduction_over_group_temps_krn< + sycl::event partial_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; + ResIndexerT res_iter_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups_ * wg}; + auto localRange = sycl::range<1>{wg}; + if constexpr (can_use_reduce_over_group::value) { + 
using KernelName = class reduction_over_group_temps_krn< resTy, resTy, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT>; cgh.parallel_for( @@ -856,7 +1192,25 @@ sycl::event sum_reduction_over_group_temps_strided_impl( in_out_iter_indexer, reduction_indexer, remaining_reduction_nelems, iter_nelems, preferrered_reductions_per_wi)); - }); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_reduction_over_group_temps_krn< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + temp_arg, temp2_arg, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, + local_memory, remaining_reduction_nelems, + iter_nelems, preferrered_reductions_per_wi)); + } + }); remaining_reduction_nelems = reduction_groups_; std::swap(temp_arg, temp2_arg); @@ -900,18 +1254,37 @@ sycl::event sum_reduction_over_group_temps_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = class sum_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor( - temp_arg, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, - remaining_reduction_nelems, iter_nelems, - reductions_per_wi)); + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(temp_arg, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, + remaining_reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class custom_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + temp_arg, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + reductions_per_wi)); + } }); sycl::event cleanup_host_task_event = @@ -931,31 +1304,332 @@ sycl::event sum_reduction_over_group_temps_strided_impl( } } -/* @brief Types supported by plus-reduction code based on atomic_ref */ +/* @brief Types supported by comparison-reduction code based on atomic_ref */ template -struct TypePairSupportDataForSumReductionAtomic +struct TypePairSupportDataForCompReductionAtomic { /* value if true a kernel for must be instantiated, false * otherwise */ static constexpr bool is_defined = std::disjunction< // disjunction is C++17 // feature, supported - // by DPC++ input bool - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // 
input int8 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint8 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, + // by DPC++ + // input int32 + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + // input float + td_ns::TypePairDefinedEntry, + // input double + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForCompReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + // input int8_t + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct MaxOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = 
su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +// Sum + +/* @brief Types supported by plus-reduction code based on atomic_ref */ +template +struct TypePairSupportDataForSumReductionAtomic +{ + + /* value if true a kernel for must be instantiated, false + * otherwise */ + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, @@ -1105,9 +1779,10 @@ struct 
SumOverAxisAtomicStridedFactory if constexpr (TypePairSupportDataForSumReductionAtomic< srcTy, dstTy>::is_defined) { + using ReductionOpT = sycl::plus; return dpctl::tensor::kernels:: - sum_reduction_over_group_with_atomics_strided_impl; + reduction_over_group_with_atomics_strided_impl; } else { return nullptr; @@ -1122,8 +1797,10 @@ struct SumOverAxisTempsStridedFactory { if constexpr (TypePairSupportDataForSumReductionTemps< srcTy, dstTy>::is_defined) { + using ReductionOpT = sycl::plus; return dpctl::tensor::kernels:: - sum_reduction_over_group_temps_strided_impl; + reduction_over_group_temps_strided_impl; } else { return nullptr; @@ -1139,9 +1816,10 @@ struct SumOverAxis1AtomicContigFactory if constexpr (TypePairSupportDataForSumReductionAtomic< srcTy, dstTy>::is_defined) { + using ReductionOpT = sycl::plus; return dpctl::tensor::kernels:: - sum_reduction_axis1_over_group_with_atomics_contig_impl; + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; } else { return nullptr; @@ -1157,9 +1835,1188 @@ struct SumOverAxis0AtomicContigFactory if constexpr (TypePairSupportDataForSumReductionAtomic< srcTy, dstTy>::is_defined) { + using ReductionOpT = sycl::plus; return dpctl::tensor::kernels:: - sum_reduction_axis0_over_group_with_atomics_contig_impl; + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +// Product + +/* @brief Types supported by plus-reduction code based on atomic_ref */ +template +struct TypePairSupportDataForProductReductionAtomic +{ + + /* value if true a kernel for must be instantiated, false + * otherwise */ + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input double + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForProductReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 
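+        // A hypothetical sketch of the td_ns helpers these tables rely on
+        // (the actual definitions live in utils/type_dispatch.hpp):
+        //
+        //   template <typename argT, typename argCandT,
+        //             typename outT, typename outCandT>
+        //   struct TypePairDefinedEntry
+        //       : std::bool_constant<std::is_same_v<argT, argCandT> &&
+        //                            std::is_same_v<outT, outCandT>>
+        //   {
+        //       static constexpr bool is_defined = true;
+        //   };
+        //
+        //   struct NotDefinedEntry : std::true_type
+        //   {
+        //       static constexpr bool is_defined = false;
+        //   };
+        //
+        // std::disjunction resolves to the first entry whose `value` is
+        // true: matching (input, output) pairs expose is_defined = true,
+        // while the always-"true" NotDefinedEntry sentinel exposes
+        // is_defined = false, so unsupported pairs make the factories below
+        // return nullptr.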
+ // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns:: + TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input double + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-throug + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct ProductOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +// Argmax and Argmin + +/* = Search reduction using reduce_over_group*/ + +template +struct SearchReduction +{ +private: + const argT *inp_ = nullptr; + argT *vals_ = nullptr; + const outT *inds_ = 
nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + argT identity_; + IdxReductionOp idx_reduction_op_; + outT idx_identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + size_t reduction_max_gid_ = 0; + size_t iter_gws_ = 1; + size_t reductions_per_wi = 16; + +public: + SearchReduction(const argT *data, + argT *vals, + const outT *inds, + outT *res, + ReductionOp reduction_op, + const argT &identity_val, + IdxReductionOp idx_reduction_op, + const outT &idx_identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + size_t reduction_size, + size_t iteration_size, + size_t reduction_size_per_wi) + : inp_(data), vals_(vals), inds_(inds), out_(res), + reduction_op_(reduction_op), identity_(identity_val), + idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val), + inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size), iter_gws_(iteration_size), + reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const size_t reduction_lid = it.get_local_id(0); + const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; + + // work-items operates over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + argT local_red_val(identity_); + outT local_idx(idx_identity_); + size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (size_t m = 0; m < reductions_per_wi; ++m) { + size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + argT val = inp_[inp_offset]; + if (val == local_red_val) { + if constexpr (!First) { + local_idx = + idx_reduction_op_(local_idx, inds_[inp_offset]); + } + else { + local_idx = idx_reduction_op_( + local_idx, static_cast(arg_reduce_gid)); + } + } + else { + if constexpr (su_ns::IsMinimum::value) { + if (val < local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + else if constexpr (su_ns::IsMaximum::value) { + if (val > local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + } + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + argT red_val_over_wg = sycl::reduce_over_group( + work_group, local_red_val, identity_, reduction_op_); + + if constexpr (std::is_integral_v) { + local_idx = + (red_val_over_wg == local_red_val) ? local_idx : idx_identity_; + } + else { + local_idx = + (red_val_over_wg == local_red_val || + std::isnan(red_val_over_wg) || std::isnan(local_red_val)) + ? 
local_idx + : idx_identity_; + } + outT idx_over_wg = sycl::reduce_over_group( + work_group, local_idx, idx_identity_, idx_reduction_op_); + + if (work_group.leader()) { + // each group writes to a different memory location + if constexpr (!Last) { + // if not the final reduction, write value corresponding to + // an index to a temporary + vals_[out_iter_offset * n_reduction_groups + + reduction_batch_id] = red_val_over_wg; + } + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + idx_over_wg; + } + } +}; + +/* = Search reduction using custom_reduce_over_group*/ + +template +struct CustomSearchReduction +{ +private: + const argT *inp_ = nullptr; + argT *vals_ = nullptr; + const outT *inds_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + argT identity_; + IdxReductionOp idx_reduction_op_; + outT idx_identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + size_t reduction_max_gid_ = 0; + size_t iter_gws_ = 1; + size_t reductions_per_wi = 16; + +public: + CustomSearchReduction(const argT *data, + argT *vals, + outT *inds, + outT *res, + ReductionOp reduction_op, + const argT &identity_val, + IdxReductionOp idx_reduction_op, + const outT &idx_identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + SlmT local_mem, + size_t reduction_size, + size_t iteration_size, + size_t reduction_size_per_wi) + : inp_(data), vals_(vals), inds_(inds), out_(res), + reduction_op_(reduction_op), identity_(identity_val), + idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val), + inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const size_t reduction_lid = it.get_local_id(0); + const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; + + // work-items operates over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + argT local_red_val(identity_); + outT local_idx(idx_identity_); + size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (size_t m = 0; m < reductions_per_wi; ++m) { + size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + argT val = inp_[inp_offset]; + if (val == local_red_val) { + if constexpr (!First) { + local_idx = + idx_reduction_op_(local_idx, inds_[inp_offset]); + } + else { + local_idx = idx_reduction_op_( + local_idx, static_cast(arg_reduce_gid)); + } + } + else { + if constexpr (su_ns::IsMinimum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::less_complex; + // less_complex always 
returns false for NaNs, so + // check + if (less_complex(val, local_red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + else if constexpr (std::is_floating_point_v) { + if (val < local_red_val || std::isnan(val)) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + else { + if (val < local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + } + else if constexpr (su_ns::IsMaximum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::greater_complex; + if (greater_complex(val, local_red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + else if constexpr (std::is_floating_point_v) { + if (val > local_red_val || std::isnan(val)) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + else { + if (val > local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + } + } + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + argT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, local_mem_, local_red_val, reduction_op_); + + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + // equality does not hold for NaNs, so check here + local_idx = (red_val_over_wg == local_red_val || + std::isnan(std::real(local_red_val)) || + std::isnan(std::imag(local_red_val))) + ? local_idx + : idx_identity_; + } + else if constexpr (std::is_floating_point_v) { + // equality does not hold for NaNs, so check here + local_idx = + (red_val_over_wg == local_red_val || std::isnan(local_red_val)) + ? local_idx + : idx_identity_; + } + else { + local_idx = + red_val_over_wg == local_red_val ? 
local_idx : idx_identity_; + } + outT idx_over_wg = sycl::reduce_over_group( + work_group, local_idx, idx_identity_, idx_reduction_op_); + if (work_group.leader()) { + // each group writes to a different memory location + if constexpr (!Last) { + // if not the final reduction, write value corresponding to + // an index to a temporary + vals_[out_iter_offset * n_reduction_groups + + reduction_batch_id] = red_val_over_wg; + } + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + idx_over_wg; + } + } +}; + +typedef sycl::event (*search_reduction_strided_impl_fn_ptr)( + sycl::queue, + size_t, + size_t, + const char *, + char *, + int, + const py::ssize_t *, + py::ssize_t, + py::ssize_t, + int, + const py::ssize_t *, + py::ssize_t, + const std::vector &); + +template +class search_reduction_over_group_temps_krn; + +template +class search_custom_reduction_over_group_temps_krn; + +using dpctl::tensor::sycl_utils::choose_workgroup_size; + +template +sycl::event search_reduction_over_group_temps_strided_impl( + sycl::queue exec_q, + size_t iter_nelems, // number of reductions (num. of rows in a matrix + // when reducing over rows) + size_t reduction_nelems, // size of each reduction (length of rows, i.e. + // number of columns) + const char *arg_cp, + char *res_cp, + int iter_nd, + const py::ssize_t *iter_shape_and_strides, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + int red_nd, + const py::ssize_t *reduction_shape_stride, + py::ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + constexpr argTy identity_val = su_ns::Identity::value; + constexpr resTy idx_identity_val = su_ns::Identity::value; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + constexpr size_t preferrered_reductions_per_wi = 4; + // max_max_wg prevents running out of resources on CPU + size_t max_wg = std::min( + size_t(2048), d.get_info()); + + size_t reductions_per_wi(preferrered_reductions_per_wi); + if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { + // reduction only requries 1 work-group, can output directly to res + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, + iter_shape_and_strides}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + wg = max_wg; + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class search_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, true, true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + arg_tp, nullptr, nullptr, res_tp, ReductionOpT(), + identity_val, IndexOpT(), idx_identity_val, + in_out_iter_indexer, reduction_indexer, 
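+                        // SearchReduction reduces two quantities in
+                        // lock-step: the extremal value (ReductionOpT over
+                        // argTy) and the index attaining it (IndexOpT over
+                        // resTy). A sequential model of what one work-item
+                        // computes for argmax (illustrative only):
+                        //
+                        //   argTy best = identity_val;       // e.g. -inf
+                        //   resTy best_idx = idx_identity_val;
+                        //   for (size_t i : assigned_elements) {
+                        //       if (v[i] > best) {
+                        //           best = v[i];
+                        //           best_idx = i;
+                        //       }
+                        //       else if (v[i] == best) {
+                        //           // ties resolve via IndexOpT, i.e. the
+                        //           // smallest index for sycl::minimum
+                        //           best_idx = std::min<resTy>(best_idx, i);
+                        //       }
+                        //   }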
+ reduction_nelems, iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class search_custom_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, true, + true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + arg_tp, nullptr, nullptr, res_tp, ReductionOpT(), + identity_val, IndexOpT(), idx_identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + }); + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + size_t reduction_groups = + (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / + (preferrered_reductions_per_wi * wg); + assert(reduction_groups > 1); + + size_t second_iter_reduction_groups_ = + (reduction_groups + preferrered_reductions_per_wi * wg - 1) / + (preferrered_reductions_per_wi * wg); + + resTy *partially_reduced_tmp = sycl::malloc_device( + iter_nelems * (reduction_groups + second_iter_reduction_groups_), + exec_q); + resTy *partially_reduced_tmp2 = nullptr; + + if (partially_reduced_tmp == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + else { + partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + } + + argTy *partially_reduced_vals_tmp = sycl::malloc_device( + iter_nelems * (reduction_groups + second_iter_reduction_groups_), + exec_q); + argTy *partially_reduced_vals_tmp2 = nullptr; + + if (partially_reduced_vals_tmp == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + else { + partially_reduced_vals_tmp2 = + partially_reduced_vals_tmp + reduction_groups * iter_nelems; + } + + sycl::event first_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + // Only 2*iter_nd entries describing shape and strides of iterated + // dimensions of input array from iter_shape_and_strides are going + // to be accessed by inp_indexer + InputIndexerT inp_indexer(iter_nd, iter_arg_offset, + iter_shape_and_strides); + ResIndexerT noop_tmp_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + noop_tmp_indexer}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class search_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, true, false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, ReductionOpT(), identity_val, + IndexOpT(), idx_identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class 
search_custom_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, true, + false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, ReductionOpT(), identity_val, + IndexOpT(), idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, reduction_nelems, + iter_nelems, preferrered_reductions_per_wi)); + } + }); + + size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + + argTy *vals_temp_arg = partially_reduced_vals_tmp; + argTy *vals_temp2_arg = partially_reduced_vals_tmp2; + + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferrered_reductions_per_wi * max_wg) { + size_t reduction_groups_ = + (remaining_reduction_nelems + + preferrered_reductions_per_wi * wg - 1) / + (preferrered_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + sycl::event partial_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; + ResIndexerT res_iter_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups_ * wg}; + auto localRange = sycl::range<1>{wg}; + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class search_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, false, + false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, remaining_reduction_nelems, + iter_nelems, preferrered_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class search_custom_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, + false, false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); + } + }); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + std::swap(vals_temp_arg, vals_temp2_arg); + dependent_ev = partial_reduction_ev; + } + + // final reduction to res + sycl::event final_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = + 
dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(remaining_reduction_nelems)}; + ResIndexerT res_iter_indexer{iter_nd, iter_res_offset, + /* shape */ iter_shape_and_strides, + /*s trides */ iter_shape_and_strides + + 2 * iter_nd}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = + std::max(1, (remaining_reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class search_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, false, true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + vals_temp_arg, nullptr, temp_arg, res_tp, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, remaining_reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class search_custom_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, false, + true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + vals_temp_arg, nullptr, temp_arg, res_tp, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + reductions_per_wi)); + } + }); + + sycl::event cleanup_host_task_event = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(final_reduction_ev); + sycl::context ctx = exec_q.get_context(); + + cgh.host_task( + [ctx, partially_reduced_tmp, partially_reduced_vals_tmp] { + sycl::free(partially_reduced_tmp, ctx); + sycl::free(partially_reduced_vals_tmp, ctx); + }); + }); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +template +struct TypePairSupportDataForSearchReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + // input int8_t + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::int64_t>, + + td_ns::TypePairDefinedEntry, + outTy, + std::int64_t>, + + // 
fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct ArgmaxOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSearchReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_reduction_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_reduction_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct ArgminOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSearchReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_reduction_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_reduction_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } } else { return nullptr; diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp index 2fc7b02efa..0d4240c516 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp @@ -26,14 +26,79 @@ #include #include #include +#include #include +#include "math_utils.hpp" + namespace dpctl { namespace tensor { namespace sycl_utils { +namespace detail +{ + +template struct TypeList; + +template struct TypeList +{ + using head = Head; + using tail = TypeList; +}; + +using NullTypeList = TypeList<>; +template +struct IsNullTypeList : std::conditional_t, + std::true_type, + std::false_type> +{ +}; + +// recursively check if type is contained in given TypeList +template +struct IsContained + : std::conditional_t< + std::is_same_v>, + std::true_type, + IsContained> +{ +}; + +template <> struct TypeList<> +{ +}; + +// std::false_type when last case has been checked for membership +template struct IsContained : std::false_type +{ +}; + +template struct IsComplex : std::false_type +{ +}; +template struct IsComplex> : std::true_type +{ +}; + +} // namespace detail + +template +using sycl_ops = detail::TypeList, + sycl::bit_or, + sycl::bit_xor, + sycl::bit_and, + sycl::maximum, + sycl::minimum, + sycl::multiplies>; + +template struct IsSyclOp +{ + static constexpr bool value = + detail::IsContained>>::value || + detail::IsContained>>::value; +}; /*! 
@brief Find the smallest multiple of supported sub-group size larger than * nelems */ @@ -66,6 +131,183 @@ size_t choose_workgroup_size(const size_t nelems, return wg; } +template +T custom_reduce_over_group(const GroupT &wg, + LocAccT local_mem_acc, + const T &local_val, + const OpT &op) +{ + size_t wgs = wg.get_local_linear_range(); + local_mem_acc[wg.get_local_linear_id()] = local_val; + + sycl::group_barrier(wg, sycl::memory_scope::work_group); + + T red_val_over_wg = local_mem_acc[0]; + if (wg.leader()) { + for (size_t i = 1; i < wgs; ++i) { + red_val_over_wg = op(red_val_over_wg, local_mem_acc[i]); + } + } + + sycl::group_barrier(wg, sycl::memory_scope::work_group); + + return sycl::group_broadcast(wg, red_val_over_wg); +} + +// Reduction functors + +// Maximum + +template struct Maximum +{ + T operator()(const T &x, const T &y) const + { + if constexpr (detail::IsComplex::value) { + using dpctl::tensor::math_utils::max_complex; + return max_complex(x, y); + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) { + return (std::isnan(x) || x > y) ? x : y; + } + else if constexpr (std::is_same_v) { + return x || y; + } + else { + return (x > y) ? x : y; + } + } +}; + +// Minimum + +template struct Minimum +{ + T operator()(const T &x, const T &y) const + { + if constexpr (detail::IsComplex::value) { + using dpctl::tensor::math_utils::min_complex; + return min_complex(x, y); + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) { + return (std::isnan(x) || x < y) ? x : y; + } + else if constexpr (std::is_same_v) { + return x && y; + } + else { + return (x < y) ? x : y; + } + } +}; + +// Define identities and operator checking structs + +template struct GetIdentity +{ +}; + +// Maximum + +template +using IsMaximum = std::bool_constant> || + std::is_same_v>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = + static_cast(std::numeric_limits::has_infinity + ? static_cast(-std::numeric_limits::infinity()) + : std::numeric_limits::lowest()); +}; + +template +struct GetIdentity::value>> +{ + static constexpr bool value = false; +}; + +template +struct GetIdentity, + std::enable_if_t, Op>::value>> +{ + static constexpr std::complex value{-std::numeric_limits::infinity(), + -std::numeric_limits::infinity()}; +}; + +// Minimum + +template +using IsMinimum = std::bool_constant> || + std::is_same_v>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = + static_cast(std::numeric_limits::has_infinity + ? 
static_cast(std::numeric_limits::infinity()) + : std::numeric_limits::max()); +}; + +template +struct GetIdentity::value>> +{ + static constexpr bool value = true; +}; + +template +struct GetIdentity, + std::enable_if_t, Op>::value>> +{ + static constexpr std::complex value{std::numeric_limits::infinity(), + std::numeric_limits::infinity()}; +}; + +// Plus + +template +using IsPlus = std::bool_constant> || + std::is_same_v>>; +// Multiplies + +template +using IsMultiplies = + std::bool_constant> || + std::is_same_v>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = static_cast(1); +}; + +// Identity + +template struct Identity +{ +}; + +template +using UseBuiltInIdentity = + std::conjunction, sycl::has_known_identity>; + +template +struct Identity::value>> +{ + static constexpr T value = GetIdentity::value; +}; + +template +struct Identity::value>> +{ + static constexpr T value = sycl::known_identity::value; +}; + } // namespace sycl_utils } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reduction_over_axis.cpp b/dpctl/tensor/libtensor/source/reduction_over_axis.cpp new file mode 100644 index 0000000000..c67fcd5ba3 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reduction_over_axis.cpp @@ -0,0 +1,514 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
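The reduction helpers added to sycl_utils.hpp above compose as follows. A minimal usage sketch (namespaces abbreviated, kernel scaffolding omitted; the operator-first argument order of Identity is an assumption based on how the kernels above consume it):

namespace su_ns = dpctl::tensor::sycl_utils;

// NaN-propagating maximum with a matching identity; this pairing is what
// the comparison-reduction factories select for floating-point types.
using OpT = su_ns::Maximum<float>;
constexpr float identity = su_ns::Identity<OpT, float>::value; // -infinity

// Inside an nd-range kernel, with `slm` a sycl::local_accessor<float, 1>
// sized to the work-group:
//
//   float wg_max = su_ns::custom_reduce_over_group(
//       it.get_group(), slm, local_val, OpT{});

Note that custom_reduce_over_group stages every work-item's value in local memory, folds the values serially on the group leader, and broadcasts the result: O(wg) work where the built-in sycl::reduce_over_group is O(log wg). That is the price of supporting arbitrary operators and types.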
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include "kernels/reductions.hpp" +#include "reduction_over_axis.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; +// Max +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + max_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + max_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + max_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + max_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_max_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using td_ns::DispatchTableBuilder; + + using dpctl::tensor::kernels::MaxOverAxisAtomicStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(max_over_axis_strided_atomic_dispatch_table); + + using dpctl::tensor::kernels::MaxOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(max_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::MaxOverAxis1AtomicContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(max_over_axis1_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::MaxOverAxis0AtomicContigFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(max_over_axis0_contig_atomic_dispatch_table); +} + +} // namespace impl + +// Min +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + min_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + min_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + min_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + min_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_min_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using td_ns::DispatchTableBuilder; + + using dpctl::tensor::kernels::MinOverAxisAtomicStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(min_over_axis_strided_atomic_dispatch_table); + + using dpctl::tensor::kernels::MinOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(min_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::MinOverAxis1AtomicContigFactory; + DispatchTableBuilder + dtb3; + 
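+    // Each DispatchTableBuilder instantiates its factory for every
+    // (src, dst) pair of the td_ns type universe and stores the resulting
+    // function pointer (nullptr for unsupported pairs) in a
+    // num_types x num_types table. Consumers then perform a plain 2D
+    // lookup, e.g. (illustrative):
+    //
+    //   auto fn = min_over_axis_strided_temps_dispatch_table[src_typeid]
+    //                                                       [dst_typeid];
+    //   if (fn == nullptr) {
+    //       // the (src, dst) type combination is not supported
+    //   }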
dtb3.populate_dispatch_table(min_over_axis1_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::MinOverAxis0AtomicContigFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(min_over_axis0_contig_atomic_dispatch_table); +} + +} // namespace impl + +// Sum +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + sum_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + sum_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + sum_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + sum_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_sum_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + using dpctl::tensor::kernels::SumOverAxisAtomicStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(sum_over_axis_strided_atomic_dispatch_table); + + using dpctl::tensor::kernels::SumOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(sum_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::SumOverAxis1AtomicContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(sum_over_axis1_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::SumOverAxis0AtomicContigFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(sum_over_axis0_contig_atomic_dispatch_table); +} + +} // namespace impl + +// Product +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + prod_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + prod_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + prod_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + prod_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_prod_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + using dpctl::tensor::kernels::ProductOverAxisAtomicStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(prod_over_axis_strided_atomic_dispatch_table); + + using dpctl::tensor::kernels::ProductOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(prod_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::ProductOverAxis1AtomicContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(prod_over_axis1_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::ProductOverAxis0AtomicContigFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(prod_over_axis0_contig_atomic_dispatch_table); +} + +} // namespace impl + +// Argmax +namespace impl +{ + +using 
dpctl::tensor::kernels::search_reduction_strided_impl_fn_ptr; +static search_reduction_strided_impl_fn_ptr + argmax_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_argmax_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::search_reduction_strided_impl_fn_ptr; + using td_ns::DispatchTableBuilder; + + using dpctl::tensor::kernels::ArgmaxOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(argmax_over_axis_strided_temps_dispatch_table); +} + +} // namespace impl + +// Argmin +namespace impl +{ + +using dpctl::tensor::kernels::search_reduction_strided_impl_fn_ptr; +static search_reduction_strided_impl_fn_ptr + argmin_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_argmin_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::search_reduction_strided_impl_fn_ptr; + using td_ns::DispatchTableBuilder; + + using dpctl::tensor::kernels::ArgminOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(argmin_over_axis_strided_temps_dispatch_table); +} + +} // namespace impl + +namespace py = pybind11; + +void init_reduction_functions(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + + namespace impl = dpctl::tensor::py_internal::impl; + + using dpctl::tensor::py_internal::py_reduction_dtype_supported; + using dpctl::tensor::py_internal::py_reduction_over_axis; + + using dpctl::tensor::py_internal::check_atomic_support; + using dpctl::tensor::py_internal::fixed_decision; + + // MAX + { + using dpctl::tensor::py_internal::impl:: + populate_max_over_axis_dispatch_tables; + populate_max_over_axis_dispatch_tables(); + using impl::max_over_axis0_contig_atomic_dispatch_table; + using impl::max_over_axis1_contig_atomic_dispatch_table; + using impl::max_over_axis_strided_atomic_dispatch_table; + using impl::max_over_axis_strided_temps_dispatch_table; + + const auto &check_atomic_support_size4 = + check_atomic_support; + const auto &check_atomic_support_size8 = + check_atomic_support; + + auto max_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + max_over_axis_strided_atomic_dispatch_table, + max_over_axis_strided_temps_dispatch_table, + max_over_axis0_contig_atomic_dispatch_table, + max_over_axis1_contig_atomic_dispatch_table, + check_atomic_support_size4, check_atomic_support_size8); + }; + m.def("_max_over_axis", max_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } + + // MIN + { + using dpctl::tensor::py_internal::impl:: + populate_min_over_axis_dispatch_tables; + populate_min_over_axis_dispatch_tables(); + using impl::min_over_axis0_contig_atomic_dispatch_table; + using impl::min_over_axis1_contig_atomic_dispatch_table; + using impl::min_over_axis_strided_atomic_dispatch_table; + using impl::min_over_axis_strided_temps_dispatch_table; + + const auto &check_atomic_support_size4 = + check_atomic_support; + const auto &check_atomic_support_size8 = + check_atomic_support; + + auto min_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, 
depends, + min_over_axis_strided_atomic_dispatch_table, + min_over_axis_strided_temps_dispatch_table, + min_over_axis0_contig_atomic_dispatch_table, + min_over_axis1_contig_atomic_dispatch_table, + check_atomic_support_size4, check_atomic_support_size8); + }; + m.def("_min_over_axis", min_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } + + // SUM + { + using dpctl::tensor::py_internal::impl:: + populate_sum_over_axis_dispatch_tables; + populate_sum_over_axis_dispatch_tables(); + using impl::sum_over_axis0_contig_atomic_dispatch_table; + using impl::sum_over_axis1_contig_atomic_dispatch_table; + using impl::sum_over_axis_strided_atomic_dispatch_table; + using impl::sum_over_axis_strided_temps_dispatch_table; + + const auto &check_atomic_support_size4 = + check_atomic_support; + const auto &check_atomic_support_size8 = + check_atomic_support; + + auto sum_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + sum_over_axis_strided_atomic_dispatch_table, + sum_over_axis_strided_temps_dispatch_table, + sum_over_axis0_contig_atomic_dispatch_table, + sum_over_axis1_contig_atomic_dispatch_table, + check_atomic_support_size4, check_atomic_support_size8); + }; + m.def("_sum_over_axis", sum_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto sum_dtype_supported = + [&](const py::dtype &input_dtype, const py::dtype &output_dtype, + const std::string &dst_usm_type, sycl::queue &q) { + return py_reduction_dtype_supported( + input_dtype, output_dtype, dst_usm_type, q, + sum_over_axis_strided_atomic_dispatch_table, + sum_over_axis_strided_temps_dispatch_table, + check_atomic_support_size4, check_atomic_support_size8); + }; + m.def("_sum_over_axis_dtype_supported", sum_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype"), + py::arg("dst_usm_type"), py::arg("sycl_queue")); + } + + // PROD + { + using dpctl::tensor::py_internal::impl:: + populate_prod_over_axis_dispatch_tables; + populate_prod_over_axis_dispatch_tables(); + using impl::prod_over_axis0_contig_atomic_dispatch_table; + using impl::prod_over_axis1_contig_atomic_dispatch_table; + using impl::prod_over_axis_strided_atomic_dispatch_table; + using impl::prod_over_axis_strided_temps_dispatch_table; + + const auto &check_atomic_support_size4 = + check_atomic_support; + const auto &check_atomic_support_size8 = + check_atomic_support; + + auto prod_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + prod_over_axis_strided_atomic_dispatch_table, + prod_over_axis_strided_temps_dispatch_table, + prod_over_axis0_contig_atomic_dispatch_table, + prod_over_axis1_contig_atomic_dispatch_table, + check_atomic_support_size4, check_atomic_support_size8); + }; + m.def("_prod_over_axis", prod_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto prod_dtype_supported = + [&](const py::dtype &input_dtype, const py::dtype &output_dtype, + const std::string &dst_usm_type, sycl::queue &q) { + return py_reduction_dtype_supported( + 
input_dtype, output_dtype, dst_usm_type, q, + prod_over_axis_strided_atomic_dispatch_table, + prod_over_axis_strided_temps_dispatch_table, + check_atomic_support_size4, check_atomic_support_size8); + }; + m.def("_prod_over_axis_dtype_supported", prod_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype"), + py::arg("dst_usm_type"), py::arg("sycl_queue")); + } + + // ARGMAX + { + using dpctl::tensor::py_internal::impl:: + populate_argmax_over_axis_dispatch_tables; + populate_argmax_over_axis_dispatch_tables(); + using impl::argmax_over_axis_strided_temps_dispatch_table; + + auto argmax_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_search_over_axis; + return py_search_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + argmax_over_axis_strided_temps_dispatch_table); + }; + m.def("_argmax_over_axis", argmax_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } + + // ARGMIN + { + using dpctl::tensor::py_internal::impl:: + populate_argmin_over_axis_dispatch_tables; + populate_argmin_over_axis_dispatch_tables(); + using impl::argmin_over_axis_strided_temps_dispatch_table; + + auto argmin_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_search_over_axis; + return py_search_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + argmin_over_axis_strided_temps_dispatch_table); + }; + m.def("_argmin_over_axis", argmin_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sum_reductions.cpp b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp similarity index 57% rename from dpctl/tensor/libtensor/source/sum_reductions.cpp rename to dpctl/tensor/libtensor/source/reduction_over_axis.hpp index 529096f5b6..1a9cb6f5e7 100644 --- a/dpctl/tensor/libtensor/source/sum_reductions.cpp +++ b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp @@ -1,8 +1,8 @@ -//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -16,16 +16,19 @@ // See the License for the specific language governing permissions and // limitations under the License. // -//===--------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// /// /// \file -/// This file defines functions of dpctl.tensor._tensor_impl extensions -//===--------------------------------------------------------------------===// +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for reductions. 
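The `_argmax_over_axis` and `_argmin_over_axis` entry points registered above surface as `dpt.argmax` and `dpt.argmin`. A small sketch of the expected semantics, assuming a working default device:

    import dpctl.tensor as dpt

    x = dpt.asarray([[1, 5, 2], [7, 0, 3]], dtype="i4")
    # with axis=None the returned index is into the flattened array
    assert int(dpt.argmax(x)) == 3
    assert dpt.all(dpt.argmax(x, axis=1) == dpt.asarray([1, 0]))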
+//===----------------------------------------------------------------------===// + +#pragma once #include #include -#include -#include +#include +#include #include #include @@ -35,8 +38,6 @@ #include #include "kernels/reductions.hpp" -#include "sum_reductions.hpp" - #include "simplify_iteration_space.hpp" #include "utils/memory_overlap.hpp" #include "utils/offset_utils.hpp" @@ -49,14 +50,15 @@ namespace tensor namespace py_internal { +template <bool require_atomic64> bool check_atomic_support(const sycl::queue &exec_q, - sycl::usm::alloc usm_alloc_type, - bool require_atomic64 = false) + sycl::usm::alloc usm_alloc_type) { bool supports_atomics = false; const sycl::device &dev = exec_q.get_device(); - if (require_atomic64) { + + if constexpr (require_atomic64) { if (!dev.has(sycl::aspect::atomic64)) return false; } @@ -78,28 +80,106 @@ bool check_atomic_support(const sycl::queue &exec_q, return supports_atomics; } -using dpctl::tensor::kernels::sum_reduction_strided_impl_fn_ptr; -static sum_reduction_strided_impl_fn_ptr - sum_over_axis_strided_atomic_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static sum_reduction_strided_impl_fn_ptr - sum_over_axis_strided_temps_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -using dpctl::tensor::kernels::sum_reduction_contig_impl_fn_ptr; -static sum_reduction_contig_impl_fn_ptr - sum_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static sum_reduction_contig_impl_fn_ptr - sum_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -std::pair<sycl::event, sycl::event> py_sum_over_axis( +template <bool return_value> +bool fixed_decision(const sycl::queue &, sycl::usm::alloc) +{ + return return_value; +} + +/* ====================== dtype supported ======================== */ + +template <typename fnT, typename CheckAtomicSupportFnT> +bool py_reduction_dtype_supported( + const py::dtype &input_dtype, + const py::dtype &output_dtype, + const std::string &dst_usm_type, + sycl::queue &q, + const fnT &atomic_dispatch_table, + const fnT &temps_dispatch_table, + const CheckAtomicSupportFnT &check_atomic_support_size4, + const CheckAtomicSupportFnT &check_atomic_support_size8) +{ + int arg_tn = + input_dtype.num(); // NumPy type numbers are the same as in dpctl + int out_tn = + output_dtype.num(); // NumPy type numbers are the same as in dpctl + int arg_typeid = -1; + int out_typeid = -1; + + auto array_types = td_ns::usm_ndarray_types(); + + try { + arg_typeid = array_types.typenum_to_lookup_id(arg_tn); + out_typeid = array_types.typenum_to_lookup_id(out_tn); + } catch (const std::exception &e) { + throw py::value_error(e.what()); + } + + if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 || + out_typeid >= td_ns::num_types) + { + throw std::runtime_error("Reduction type support check: lookup failed"); + } + + // remove_all_extents gets underlying type of table + using fn_ptrT = typename std::remove_all_extents<fnT>::type; + fn_ptrT fn = nullptr; + + sycl::usm::alloc kind = sycl::usm::alloc::unknown; + + if (dst_usm_type == "device") { + kind = sycl::usm::alloc::device; + } + else if (dst_usm_type == "shared") { + kind = sycl::usm::alloc::shared; + } + else if (dst_usm_type == "host") { + kind = sycl::usm::alloc::host; + } + else { + throw py::value_error("Unrecognized `dst_usm_type` argument."); + } + + bool supports_atomics = false; + + switch (output_dtype.itemsize()) { + case sizeof(float): + { + supports_atomics = check_atomic_support_size4(q, kind); + } break; + case sizeof(double): + { + supports_atomics = check_atomic_support_size8(q, kind); + } break; + } + + if
(supports_atomics) { + fn = atomic_dispatch_table[arg_typeid][out_typeid]; + } + + if (fn == nullptr) { + // use slower reduction implementation using temporaries + fn = temps_dispatch_table[arg_typeid][out_typeid]; + } + + return (fn != nullptr); +} + +/* ==================== Generic reductions ====================== */ + +template <typename strided_fnT, typename contig_fnT, typename SupportAtomicFnT> +std::pair<sycl::event, sycl::event> py_reduction_over_axis( const dpctl::tensor::usm_ndarray &src, - int trailing_dims_to_reduce, // sum over this many trailing indexes + int trailing_dims_to_reduce, // comp over this many trailing indexes const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, - const std::vector<sycl::event> &depends) + const std::vector<sycl::event> &depends, + const strided_fnT &atomic_dispatch_table, + const strided_fnT &temps_dispatch_table, + const contig_fnT &axis0_dispatch_table, + const contig_fnT &axis1_dispatch_table, + const SupportAtomicFnT &check_atomic_support_size4, + const SupportAtomicFnT &check_atomic_support_size8) { int src_nd = src.get_ndim(); int iteration_nd = src_nd - trailing_dims_to_reduce; @@ -160,6 +240,7 @@ std::pair<sycl::event, sycl::event> py_sum_over_axis( int src_typenum = src.get_typenum(); int dst_typenum = dst.get_typenum(); + namespace td_ns = dpctl::tensor::type_dispatch; const auto &array_types = td_ns::usm_ndarray_types(); int src_typeid = array_types.typenum_to_lookup_id(src_typenum); int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); @@ -173,7 +254,7 @@ std::pair<sycl::event, sycl::event> py_sum_over_axis( void *data_ptr = dst.get_data(); const auto &ctx = exec_q.get_context(); auto usm_type = sycl::get_pointer_type(data_ptr, ctx); - supports_atomics = check_atomic_support(exec_q, usm_type); + supports_atomics = check_atomic_support_size4(exec_q, usm_type); } break; case sizeof(double): { @@ -181,9 +262,7 @@ void *data_ptr = dst.get_data(); const auto &ctx = exec_q.get_context(); auto usm_type = sycl::get_pointer_type(data_ptr, ctx); - constexpr bool require_atomic64 = true; - supports_atomics = - check_atomic_support(exec_q, usm_type, require_atomic64); + supports_atomics = check_atomic_support_size8(exec_q, usm_type); } break; } @@ -197,14 +276,14 @@ if ((is_src_c_contig && is_dst_c_contig) || (is_src_f_contig && dst_nelems == 1)) { - auto fn = sum_over_axis1_contig_atomic_dispatch_table[src_typeid] - [dst_typeid]; + auto fn = axis1_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { size_t iter_nelems = dst_nelems; constexpr py::ssize_t zero_offset = 0; - sycl::event sum_over_axis_contig_ev = + sycl::event reduction_over_axis_contig_ev = fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), dst.get_data(), zero_offset, // iteration_src_offset @@ depends); sycl::event keep_args_event = dpctl::utils::keep_args_alive( - exec_q, {src, dst}, {sum_over_axis_contig_ev}); - return std::make_pair(keep_args_event, sum_over_axis_contig_ev); + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); } } else if (is_src_f_contig && ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) { - auto fn = sum_over_axis0_contig_atomic_dispatch_table[src_typeid] - [dst_typeid]; + auto fn = axis0_dispatch_table[src_typeid][dst_typeid]; if (fn != nullptr) { size_t iter_nelems = dst_nelems; constexpr py::ssize_t zero_offset = 0; - sycl::event sum_over_axis_contig_ev = + sycl::event reduction_over_axis_contig_ev = fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), dst.get_data(), zero_offset, // iteration_src_offset @@ -237,9 +316,10 @@
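Whether the atomic code path is taken depends on the destination's USM allocation kind and on device aspects; 8-byte atomics additionally require the `atomic64` aspect, which is what the size-8 check enforces. A Python-level analogue of that capability query, assuming the default device selector finds a device:

    import dpctl

    dev = dpctl.select_default_device()
    # 4-byte atomics are generally available on supported devices;
    # 8-byte reductions also need the atomic64 aspect, mirroring
    # the check_atomic_support_size8 helper
    print("atomic64:", dev.has_aspect_atomic64)
    print("device USM:", dev.has_aspect_usm_device_allocations)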
std::pair py_sum_over_axis( depends); sycl::event keep_args_event = dpctl::utils::keep_args_alive( - exec_q, {src, dst}, {sum_over_axis_contig_ev}); + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); - return std::make_pair(keep_args_event, sum_over_axis_contig_ev); + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); } } } @@ -320,50 +400,49 @@ std::pair py_sum_over_axis( } if (mat_reduce_over_axis1 || array_reduce_all_elems) { - auto fn = sum_over_axis1_contig_atomic_dispatch_table[src_typeid] - [dst_typeid]; + auto fn = axis1_dispatch_table[src_typeid][dst_typeid]; if (fn != nullptr) { - sycl::event sum_over_axis1_contig_ev = + sycl::event reduction_over_axis1_contig_ev = fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), dst.get_data(), iteration_src_offset, iteration_dst_offset, reduction_src_offset, depends); sycl::event keep_args_event = dpctl::utils::keep_args_alive( - exec_q, {src, dst}, {sum_over_axis1_contig_ev}); + exec_q, {src, dst}, {reduction_over_axis1_contig_ev}); return std::make_pair(keep_args_event, - sum_over_axis1_contig_ev); + reduction_over_axis1_contig_ev); } } else if (mat_reduce_over_axis0) { - auto fn = sum_over_axis0_contig_atomic_dispatch_table[src_typeid] - [dst_typeid]; + auto fn = axis0_dispatch_table[src_typeid][dst_typeid]; if (fn != nullptr) { - sycl::event sum_over_axis0_contig_ev = + sycl::event reduction_over_axis0_contig_ev = fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), dst.get_data(), iteration_src_offset, iteration_dst_offset, reduction_src_offset, depends); sycl::event keep_args_event = dpctl::utils::keep_args_alive( - exec_q, {src, dst}, {sum_over_axis0_contig_ev}); + exec_q, {src, dst}, {reduction_over_axis0_contig_ev}); return std::make_pair(keep_args_event, - sum_over_axis0_contig_ev); + reduction_over_axis0_contig_ev); } } } - using dpctl::tensor::kernels::sum_reduction_strided_impl_fn_ptr; - sum_reduction_strided_impl_fn_ptr fn = nullptr; + // remove_all_extents gets underlying type of table + using strided_fn_ptr_T = + typename std::remove_all_extents::type; + strided_fn_ptr_T fn = nullptr; if (supports_atomics) { - fn = - sum_over_axis_strided_atomic_dispatch_table[src_typeid][dst_typeid]; + fn = atomic_dispatch_table[src_typeid][dst_typeid]; } if (fn == nullptr) { // use slower reduction implementation using temporaries - fn = sum_over_axis_strided_temps_dispatch_table[src_typeid][dst_typeid]; + fn = temps_dispatch_table[src_typeid][dst_typeid]; if (fn == nullptr) { throw std::runtime_error("Datatypes are not supported"); } @@ -398,14 +477,15 @@ std::pair py_sum_over_axis( std::copy(depends.begin(), depends.end(), all_deps.begin()); all_deps.push_back(copy_metadata_ev); - auto comp_ev = fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), - dst.get_data(), iteration_nd, iter_shape_and_strides, - iteration_src_offset, iteration_dst_offset, - reduction_nd, // number dimensions being reduced - reduction_shape_stride, reduction_src_offset, all_deps); + auto reduction_ev = + fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), dst.get_data(), + iteration_nd, iter_shape_and_strides, iteration_src_offset, + iteration_dst_offset, + reduction_nd, // number dimensions being reduced + reduction_shape_stride, reduction_src_offset, all_deps); sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(comp_ev); + cgh.depends_on(reduction_ev); const auto &ctx = exec_q.get_context(); cgh.host_task([ctx, temp_allocation_ptr] { sycl::free(temp_allocation_ptr, ctx); @@ 
-416,127 +496,194 @@ std::pair py_sum_over_axis( sycl::event keep_args_event = dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); - return std::make_pair(keep_args_event, comp_ev); + return std::make_pair(keep_args_event, reduction_ev); } -bool py_sum_over_axis_dtype_supported(const py::dtype &input_dtype, - const py::dtype &output_dtype, - const std::string &dst_usm_type, - sycl::queue &q) -{ - int arg_tn = - input_dtype.num(); // NumPy type numbers are the same as in dpctl - int out_tn = - output_dtype.num(); // NumPy type numbers are the same as in dpctl - int arg_typeid = -1; - int out_typeid = -1; - - auto array_types = td_ns::usm_ndarray_types(); +/* ==================== Search reductions ====================== */ - try { - arg_typeid = array_types.typenum_to_lookup_id(arg_tn); - out_typeid = array_types.typenum_to_lookup_id(out_tn); - } catch (const std::exception &e) { - throw py::value_error(e.what()); +template +std::pair py_search_over_axis( + const dpctl::tensor::usm_ndarray &src, + int trailing_dims_to_reduce, // comp over this many trailing indexes + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends, + const fn_tableT &dispatch_table) +{ + int src_nd = src.get_ndim(); + int iteration_nd = src_nd - trailing_dims_to_reduce; + if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_reduce must be positive, but no " + "greater than rank of the array being reduced"); } - if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 || - out_typeid >= td_ns::num_types) - { - throw std::runtime_error("Reduction type support check: lookup failed"); + int dst_nd = dst.get_ndim(); + if (dst_nd != iteration_nd) { + throw py::value_error("Destination array rank does not match input " + "array rank and number of reduced dimensions"); } - using dpctl::tensor::kernels::sum_reduction_strided_impl_fn_ptr; - sum_reduction_strided_impl_fn_ptr fn = nullptr; - - sycl::usm::alloc kind = sycl::usm::alloc::unknown; + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); - if (dst_usm_type == "device") { - kind = sycl::usm::alloc::device; + bool same_shapes = true; + for (int i = 0; same_shapes && (i < dst_nd); ++i) { + same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); } - else if (dst_usm_type == "shared") { - kind = sycl::usm::alloc::shared; + + if (!same_shapes) { + throw py::value_error("Destination shape does not match unreduced " + "dimensions of the input shape"); } - else if (dst_usm_type == "host") { - kind = sycl::usm::alloc::host; + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); } - else { - throw py::value_error("Unrecognized `dst_usm_type` argument."); + + size_t dst_nelems = dst.get_size(); + + size_t reduction_nelems(1); + for (int i = dst_nd; i < src_nd; ++i) { + reduction_nelems *= static_cast(src_shape_ptr[i]); } - bool supports_atomics = false; + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } - switch (output_dtype.itemsize()) { - case sizeof(float): - { - supports_atomics = check_atomic_support(q, kind); - } break; - case sizeof(double): + // destination must be ample enough to accommodate all elements { - constexpr bool 
require_atomic64 = true; - supports_atomics = check_atomic_support(q, kind, require_atomic64); - } break; + auto dst_offsets = dst.get_minmax_offsets(); + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < dst_nelems) { + throw py::value_error( + "Destination array can not accommodate all the " + "elements of source array."); + } } - if (supports_atomics) { - fn = - sum_over_axis_strided_atomic_dispatch_table[arg_typeid][out_typeid]; + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + namespace td_ns = dpctl::tensor::type_dispatch; + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + using dpctl::tensor::py_internal::simplify_iteration_space; + using dpctl::tensor::py_internal::simplify_iteration_space_1; + + auto const &src_shape_vecs = src.get_shape_vector(); + auto const &src_strides_vecs = src.get_strides_vector(); + auto const &dst_strides_vecs = dst.get_strides_vector(); + + int reduction_nd = trailing_dims_to_reduce; + const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd; + using shT = std::vector; + shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd, + std::end(src_strides_vecs)); + + shT compact_reduction_shape; + shT compact_reduction_src_strides; + py::ssize_t reduction_src_offset(0); + + compact_iteration_space( + reduction_nd, reduction_shape_ptr, reduction_src_strides, + // output + compact_reduction_shape, compact_reduction_src_strides); + + const py::ssize_t *iteration_shape_ptr = src_shape_ptr; + + shT iteration_src_strides(std::begin(src_strides_vecs), + std::begin(src_strides_vecs) + iteration_nd); + shT const &iteration_dst_strides = dst_strides_vecs; + + shT simplified_iteration_shape; + shT simplified_iteration_src_strides; + shT simplified_iteration_dst_strides; + py::ssize_t iteration_src_offset(0); + py::ssize_t iteration_dst_offset(0); + + if (iteration_nd == 0) { + if (dst_nelems != 1) { + throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1"); + } + iteration_nd = 1; + simplified_iteration_shape.push_back(1); + simplified_iteration_src_strides.push_back(0); + simplified_iteration_dst_strides.push_back(0); + } + else { + simplify_iteration_space(iteration_nd, iteration_shape_ptr, + iteration_src_strides, iteration_dst_strides, + // output + simplified_iteration_shape, + simplified_iteration_src_strides, + simplified_iteration_dst_strides, + iteration_src_offset, iteration_dst_offset); } + auto fn = dispatch_table[src_typeid][dst_typeid]; if (fn == nullptr) { - // use slower reduction implementation using temporaries - fn = sum_over_axis_strided_temps_dispatch_table[arg_typeid][out_typeid]; + throw std::runtime_error("Datatypes are not supported"); } - return (fn != nullptr); -} + std::vector host_task_events{}; -void populate_sum_over_axis_dispatch_table(void) -{ - using dpctl::tensor::kernels::sum_reduction_contig_impl_fn_ptr; - using dpctl::tensor::kernels::sum_reduction_strided_impl_fn_ptr; - using namespace td_ns; - - using dpctl::tensor::kernels::SumOverAxisAtomicStridedFactory; - DispatchTableBuilder - dtb1; - dtb1.populate_dispatch_table(sum_over_axis_strided_atomic_dispatch_table); - - using dpctl::tensor::kernels::SumOverAxisTempsStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(sum_over_axis_strided_temps_dispatch_table); - - using dpctl::tensor::kernels::SumOverAxis1AtomicContigFactory; 
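`compact_iteration_space` collapses adjacent reduced dimensions whose strides chain contiguously, so the kernel loops over fewer, larger extents. A toy, illustrative Python sketch of the underlying idea (simplified; not the actual C++ helper):

    # fuse dims i, i+1 when stride_i == shape_{i+1} * stride_{i+1}
    def compact_iteration_space(shape, strides):
        out_shape, out_strides = [shape[0]], [strides[0]]
        for sh, st in zip(shape[1:], strides[1:]):
            if out_strides[-1] == sh * st:
                out_shape[-1] *= sh
                out_strides[-1] = st
            else:
                out_shape.append(sh)
                out_strides.append(st)
        return out_shape, out_strides

    # a C-contiguous (3, 4, 5) block collapses to a single dimension
    print(compact_iteration_space([3, 4, 5], [20, 5, 1]))  # ([60], [1])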
- DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(sum_over_axis1_contig_atomic_dispatch_table); - - using dpctl::tensor::kernels::SumOverAxis0AtomicContigFactory; - DispatchTableBuilder - dtb4; - dtb4.populate_dispatch_table(sum_over_axis0_contig_atomic_dispatch_table); -} + using dpctl::tensor::offset_utils::device_allocate_and_pack; -namespace py = pybind11; + const auto &arrays_metainfo_packing_triple_ = + device_allocate_and_pack( + exec_q, host_task_events, + // iteration metadata + simplified_iteration_shape, simplified_iteration_src_strides, + simplified_iteration_dst_strides, + // reduction metadata + compact_reduction_shape, compact_reduction_src_strides); + py::ssize_t *temp_allocation_ptr = + std::get<0>(arrays_metainfo_packing_triple_); + if (temp_allocation_ptr == nullptr) { + throw std::runtime_error("Unable to allocate memory on device"); + } + const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); -void init_reduction_functions(py::module_ m) -{ - populate_sum_over_axis_dispatch_table(); + py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + py::ssize_t *reduction_shape_stride = + temp_allocation_ptr + 3 * simplified_iteration_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.resize(depends.size()); + std::copy(depends.begin(), depends.end(), all_deps.begin()); + all_deps.push_back(copy_metadata_ev); + + auto comp_ev = fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_nd, iter_shape_and_strides, + iteration_src_offset, iteration_dst_offset, + reduction_nd, // number dimensions being reduced + reduction_shape_stride, reduction_src_offset, all_deps); + + sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + const auto &ctx = exec_q.get_context(); + cgh.host_task([ctx, temp_allocation_ptr] { + sycl::free(temp_allocation_ptr, ctx); + }); + }); + host_task_events.push_back(temp_cleanup_ev); - m.def("_sum_over_axis", &py_sum_over_axis, "", py::arg("src"), - py::arg("trailing_dims_to_reduce"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); - m.def("_sum_over_axis_dtype_supported", &py_sum_over_axis_dtype_supported, - "", py::arg("arg_dtype"), py::arg("out_dtype"), - py::arg("dst_usm_type"), py::arg("sycl_queue")); + return std::make_pair(keep_args_event, comp_ev); } +extern void init_reduction_functions(py::module_ m); + } // namespace py_internal } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sum_reductions.hpp b/dpctl/tensor/libtensor/source/sum_reductions.hpp deleted file mode 100644 index ac612ec1f7..0000000000 --- a/dpctl/tensor/libtensor/source/sum_reductions.hpp +++ /dev/null @@ -1,40 +0,0 @@ -//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// -// -// Data Parallel Control (dpctl) -// -// Copyright 2020-2022 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
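`init_reduction_functions` exposes these entry points on `dpctl.tensor._tensor_impl`; each binding accepts a `depends` event list and returns a pair of events (a keep-arguments-alive host task and the computation event). A sketch of driving one binding directly, assuming a default SYCL queue; this is a private API and subject to change:

    import dpctl
    import dpctl.tensor as dpt
    import dpctl.tensor._tensor_impl as ti

    q = dpctl.SyclQueue()
    x = dpt.ones((4, 5), dtype="i4", sycl_queue=q)
    out = dpt.empty((4,), dtype="i4", sycl_queue=q)
    # reduce over 1 trailing axis of x into out
    ht_ev, red_ev = ti._sum_over_axis(
        src=x, trailing_dims_to_reduce=1, dst=out, sycl_queue=q, depends=[]
    )
    red_ev.wait()            # wait for the reduction kernel
    assert dpt.all(out == 5)
    ht_ev.wait()             # release the keep-alive host task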
-// See the License for the specific language governing permissions and -// limitations under the License. -// -//===--------------------------------------------------------------------===// -/// -/// \file -/// This file defines functions of dpctl.tensor._tensor_impl extensions -//===--------------------------------------------------------------------===// - -#pragma once -#include -#include - -namespace dpctl -{ -namespace tensor -{ -namespace py_internal -{ - -extern void init_reduction_functions(py::module_ m); - -} // namespace py_internal -} // namespace tensor -} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp index 2ce7c72add..6bd0649c1f 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_py.cpp @@ -46,9 +46,9 @@ #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include "linear_sequences.hpp" +#include "reduction_over_axis.hpp" #include "repeat.hpp" #include "simplify_iteration_space.hpp" -#include "sum_reductions.hpp" #include "triul_ctor.hpp" #include "utils/memory_overlap.hpp" #include "utils/strided_iters.hpp" diff --git a/dpctl/tests/test_tensor_sum.py b/dpctl/tests/test_tensor_sum.py index 403a823324..dc647febf7 100644 --- a/dpctl/tests/test_tensor_sum.py +++ b/dpctl/tests/test_tensor_sum.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np import pytest import dpctl.tensor as dpt @@ -36,7 +35,6 @@ "c8", "c16", ] -_usm_types = ["device", "shared", "host"] @pytest.mark.parametrize("arg_dtype", _all_dtypes) @@ -56,11 +54,11 @@ def test_sum_arg_dtype_default_output_dtype_matrix(arg_dtype): assert r.dtype.kind == "f" elif m.dtype.kind == "c": assert r.dtype.kind == "c" - assert (dpt.asnumpy(r) == 100).all() + assert dpt.all(r == 100) m = dpt.ones(200, dtype=arg_dtype)[:1:-2] r = dpt.sum(m) - assert (dpt.asnumpy(r) == 99).all() + assert dpt.all(r == 99) @pytest.mark.parametrize("arg_dtype", _all_dtypes) @@ -75,7 +73,7 @@ def test_sum_arg_out_dtype_matrix(arg_dtype, out_dtype): assert isinstance(r, dpt.usm_ndarray) assert r.dtype == dpt.dtype(out_dtype) - assert (dpt.asnumpy(r) == 100).all() + assert dpt.all(r == 100) def test_sum_empty(): @@ -94,7 +92,7 @@ def test_sum_axis(): assert isinstance(s, dpt.usm_ndarray) assert s.shape == (3, 6) - assert (dpt.asnumpy(s) == np.full(s.shape, 4 * 5 * 7)).all() + assert dpt.all(s == dpt.asarray(4 * 5 * 7, dtype="i4")) def test_sum_keepdims(): @@ -105,7 +103,7 @@ def test_sum_keepdims(): assert isinstance(s, dpt.usm_ndarray) assert s.shape == (3, 1, 1, 6, 1) - assert (dpt.asnumpy(s) == np.full(s.shape, 4 * 5 * 7)).all() + assert dpt.all(s == dpt.asarray(4 * 5 * 7, dtype=s.dtype)) def test_sum_scalar(): @@ -117,7 +115,7 @@ def test_sum_scalar(): assert isinstance(s, dpt.usm_ndarray) assert m.sycl_queue == s.sycl_queue assert s.shape == () - assert dpt.asnumpy(s) == np.full((), 1) + assert s == dpt.full((), 1) @pytest.mark.parametrize("arg_dtype", _all_dtypes) @@ -132,7 +130,7 @@ def test_sum_arg_out_dtype_scalar(arg_dtype, out_dtype): assert isinstance(r, dpt.usm_ndarray) assert r.dtype == dpt.dtype(out_dtype) - assert dpt.asnumpy(r) == 1 + assert r == 1 def test_sum_keepdims_zero_size(): @@ -187,3 +185,66 @@ def test_axis0_bug(): expected = dpt.asarray([[0, 3], [1, 4], [2, 5]]) assert dpt.all(s == expected) + + +@pytest.mark.parametrize("arg_dtype", _all_dtypes[1:]) +def 
test_prod_arg_dtype_default_output_dtype_matrix(arg_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.prod(m) + + assert isinstance(r, dpt.usm_ndarray) + if m.dtype.kind == "i": + assert r.dtype.kind == "i" + elif m.dtype.kind == "u": + assert r.dtype.kind == "u" + elif m.dtype.kind == "f": + assert r.dtype.kind == "f" + elif m.dtype.kind == "c": + assert r.dtype.kind == "c" + assert dpt.all(r == 1) + + if dpt.isdtype(m.dtype, "unsigned integer"): + m = dpt.tile(dpt.arange(1, 3, dtype=arg_dtype), 10)[:1:-2] + r = dpt.prod(m) + assert dpt.all(r == dpt.asarray(512, dtype=r.dtype)) + else: + m = dpt.full(200, -1, dtype=arg_dtype)[:1:-2] + r = dpt.prod(m) + assert dpt.all(r == dpt.asarray(-1, dtype=r.dtype)) + + +def test_prod_empty(): + get_queue_or_skip() + x = dpt.empty((0,), dtype="u1") + y = dpt.prod(x) + assert y.shape == tuple() + assert int(y) == 1 + + +def test_prod_axis(): + get_queue_or_skip() + + m = dpt.ones((3, 4, 5, 6, 7), dtype="i4") + s = dpt.prod(m, axis=(1, 2, -1)) + + assert isinstance(s, dpt.usm_ndarray) + assert s.shape == (3, 6) + assert dpt.all(s == dpt.asarray(1, dtype="i4")) + + +@pytest.mark.parametrize("arg_dtype", _all_dtypes) +@pytest.mark.parametrize("out_dtype", _all_dtypes[1:]) +def test_prod_arg_out_dtype_matrix(arg_dtype, out_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + skip_if_dtype_not_supported(out_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.prod(m, dtype=out_dtype) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == dpt.dtype(out_dtype) + assert dpt.all(r == 1) diff --git a/dpctl/tests/test_usm_ndarray_reductions.py b/dpctl/tests/test_usm_ndarray_reductions.py new file mode 100644 index 0000000000..8d66f35d71 --- /dev/null +++ b/dpctl/tests/test_usm_ndarray_reductions.py @@ -0,0 +1,236 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
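A compact sketch of the `dpt.prod` behavior these tests exercise; note that the comparisons run on the device, so only the final scalar crosses to the host:

    import dpctl.tensor as dpt

    x = dpt.full(10, 2, dtype="i4")
    assert int(dpt.prod(x)) == 1024            # 2**10
    # accumulate in an explicitly requested wider integer type
    assert dpt.all(dpt.prod(x, dtype="i8") == 1024)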
+ +from random import randrange + +import numpy as np +import pytest + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + + +def test_max_min_axis(): + get_queue_or_skip() + + x = dpt.reshape( + dpt.arange((3 * 4 * 5 * 6 * 7), dtype="i4"), (3, 4, 5, 6, 7) + ) + + m = dpt.max(x, axis=(1, 2, -1)) + assert m.shape == (3, 6) + assert dpt.all(m == x[:, -1, -1, :, -1]) + + m = dpt.min(x, axis=(1, 2, -1)) + assert m.shape == (3, 6) + assert dpt.all(m == x[:, 0, 0, :, 0]) + + +def test_reduction_keepdims(): + get_queue_or_skip() + + n0, n1 = 3, 6 + x = dpt.ones((n0, 4, 5, n1, 7), dtype="i4") + m = dpt.max(x, axis=(1, 2, -1), keepdims=True) + + xx = dpt.reshape(dpt.permute_dims(x, (0, 3, 1, 2, -1)), (n0, n1, -1)) + p = dpt.argmax(xx, axis=-1, keepdims=True) + + assert m.shape == (n0, 1, 1, n1, 1) + assert dpt.all(m == dpt.reshape(x[:, 0, 0, :, 0], m.shape)) + assert dpt.all(p == 0) + + +def test_max_scalar(): + get_queue_or_skip() + + x = dpt.ones(()) + m = dpt.max(x) + + assert m.shape == () + assert x == m + + +@pytest.mark.parametrize("arg_dtype", ["i4", "f4", "c8"]) +def test_reduction_kernels(arg_dtype): + # i4 - always uses atomics w/ sycl group reduction + # f4 - always uses atomics w/ custom group reduction + # c8 - always uses temps w/ custom group reduction + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + x = dpt.ones((24, 1025), dtype=arg_dtype, sycl_queue=q) + x[x.shape[0] // 2, :] = 3 + x[:, x.shape[1] // 2] = 3 + + m = dpt.max(x) + assert m == 3 + m = dpt.max(x, axis=0) + assert dpt.all(m == 3) + m = dpt.max(x, axis=1) + assert dpt.all(m == 3) + + x = dpt.ones((24, 1025), dtype=arg_dtype, sycl_queue=q) + x[x.shape[0] // 2, :] = 0 + x[:, x.shape[1] // 2] = 0 + + m = dpt.min(x) + assert m == 0 + m = dpt.min(x, axis=0) + assert dpt.all(m == 0) + m = dpt.min(x, axis=1) + assert dpt.all(m == 0) + + +def test_max_min_nan_propagation(): + get_queue_or_skip() + + # float, finites + x = dpt.arange(4, dtype="f4") + x[0] = dpt.nan + assert dpt.isnan(dpt.max(x)) + assert dpt.isnan(dpt.min(x)) + + # float, infinities + x[1:] = dpt.inf + assert dpt.isnan(dpt.max(x)) + x[1:] = -dpt.inf + assert dpt.isnan(dpt.min(x)) + + # complex + x = dpt.arange(4, dtype="c8") + x[0] = complex(dpt.nan, 0) + assert dpt.isnan(dpt.max(x)) + assert dpt.isnan(dpt.min(x)) + + x[0] = complex(0, dpt.nan) + assert dpt.isnan(dpt.max(x)) + assert dpt.isnan(dpt.min(x)) + + +def test_argmax_scalar(): + get_queue_or_skip() + + x = dpt.ones(()) + m = dpt.argmax(x) + + assert m.shape == () + assert m == 0 + + +@pytest.mark.parametrize("arg_dtype", ["i4", "f4", "c8"]) +def test_search_reduction_kernels(arg_dtype): + # i4 - always uses atomics w/ sycl group reduction + # f4 - always uses atomics w/ custom group reduction + # c8 - always uses temps w/ custom group reduction + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + x = dpt.ones((24 * 1025), dtype=arg_dtype, sycl_queue=q) + idx = randrange(x.size) + idx_tup = np.unravel_index(idx, (24, 1025)) + x[idx] = 2 + + m = dpt.argmax(x) + assert m == idx + + x = dpt.reshape(x, (24, 1025)) + + x[idx_tup[0], :] = 3 + m = dpt.argmax(x, axis=0) + assert dpt.all(m == idx_tup[0]) + x[:, idx_tup[1]] = 4 + m = dpt.argmax(x, axis=1) + assert dpt.all(m == idx_tup[1]) + + x = x[:, ::-2] + idx = randrange(x.shape[1]) + x[:, idx] = 5 + m = dpt.argmax(x, axis=1) + assert dpt.all(m == idx) + + x = dpt.ones((24 * 1025), dtype=arg_dtype, sycl_queue=q) + idx = randrange(x.size) + idx_tup = 
np.unravel_index(idx, (24, 1025)) + x[idx] = 0 + + m = dpt.argmin(x) + assert m == idx + + x = dpt.reshape(x, (24, 1025)) + + x[idx_tup[0], :] = -1 + m = dpt.argmin(x, axis=0) + assert dpt.all(m == idx_tup[0]) + x[:, idx_tup[1]] = -2 + m = dpt.argmin(x, axis=1) + assert dpt.all(m == idx_tup[1]) + + x = x[:, ::-2] + idx = randrange(x.shape[1]) + x[:, idx] = -3 + m = dpt.argmin(x, axis=1) + assert dpt.all(m == idx) + + +def test_argmax_argmin_nan_propagation(): + get_queue_or_skip() + + sz = 4 + idx = randrange(sz) + # floats + x = dpt.arange(sz, dtype="f4") + x[idx] = dpt.nan + assert dpt.argmax(x) == idx + assert dpt.argmin(x) == idx + + # complex + x = dpt.arange(sz, dtype="c8") + x[idx] = complex(dpt.nan, 0) + assert dpt.argmax(x) == idx + assert dpt.argmin(x) == idx + + x[idx] = complex(0, dpt.nan) + assert dpt.argmax(x) == idx + assert dpt.argmin(x) == idx + + +def test_argmax_argmin_identities(): + # make sure that identity arrays work as expected + get_queue_or_skip() + + x = dpt.full(3, dpt.iinfo(dpt.int32).min, dtype="i4") + assert dpt.argmax(x) == 0 + x = dpt.full(3, dpt.iinfo(dpt.int32).max, dtype="i4") + assert dpt.argmin(x) == 0 + + +def test_reduction_arg_validation(): + get_queue_or_skip() + + x = dict() + with pytest.raises(TypeError): + dpt.sum(x) + with pytest.raises(TypeError): + dpt.max(x) + with pytest.raises(TypeError): + dpt.argmax(x) + + x = dpt.zeros((0,), dtype="i4") + with pytest.raises(ValueError): + dpt.max(x) + with pytest.raises(ValueError): + dpt.argmax(x) From dfba4369971740e69ccb62cec67e23490b43398c Mon Sep 17 00:00:00 2001 From: ndgrigorian <46709016+ndgrigorian@users.noreply.github.com> Date: Tue, 3 Oct 2023 19:23:30 -0700 Subject: [PATCH 04/83] repeat with `axis=None` repeats flattened array (#1427) * Implements flat overload for repeat Adds tests for new functionality * repeat `repeats` parameter relaxed to permit lists and ranges Docstring has been adjusted to reflect changes to `axis` as well as new `repeats` types Corrected a bug in the behavior of `repeat` for size 1 `repeats` Python sequences * Fixed repeat error syntax for `repeats array with ndim > 1 --- dpctl/tensor/_manipulation_functions.py | 82 ++-- .../libtensor/include/kernels/repeat.hpp | 88 +++-- dpctl/tensor/libtensor/source/repeat.cpp | 354 +++++++++++++++++- dpctl/tensor/libtensor/source/repeat.hpp | 15 + dpctl/tensor/libtensor/source/tensor_py.cpp | 39 +- dpctl/tests/test_usm_ndarray_manipulation.py | 24 +- 6 files changed, 503 insertions(+), 99 deletions(-) diff --git a/dpctl/tensor/_manipulation_functions.py b/dpctl/tensor/_manipulation_functions.py index 7201cd96fb..7135304b58 100644 --- a/dpctl/tensor/_manipulation_functions.py +++ b/dpctl/tensor/_manipulation_functions.py @@ -19,7 +19,6 @@ import operator import numpy as np -from numpy import AxisError from numpy.core.numeric import normalize_axis_index, normalize_axis_tuple import dpctl @@ -929,20 +928,26 @@ def repeat(x, repeats, axis=None): Args: x (usm_ndarray): input array - repeat (Union[int, Tuple[int, ...]]): + repeats (Union[int, Sequence[int, ...], usm_ndarray]): The number of repetitions for each element. - `repeats` is broadcasted to fit the shape of the given axis. + `repeats` is broadcast to fit the shape of the given axis. + If `repeats` is an array, it must have an integer data type. + Otherwise, `repeats` must be a Python integer, tuple, list, or + range. axis (Optional[int]): - The axis along which to repeat values. The `axis` is required - if input array has more than one dimension. 
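Under the revised `repeat` semantics described here, `axis=None` operates on the flattened array and `repeats` may be a sequence. A short sketch, assuming a working default device:

    import dpctl.tensor as dpt

    x = dpt.asarray([[1, 2], [3, 4]], dtype="i4")
    r = dpt.repeat(x, 2)        # axis=None: repeat the flattened array
    assert r.shape == (8,)
    assert dpt.all(r == dpt.asarray([1, 1, 2, 2, 3, 3, 4, 4]))
    # per-element counts along a given axis
    r2 = dpt.repeat(x, [1, 2], axis=1)
    assert r2.shape == (2, 3)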
+ The axis along which to repeat values. If `axis` is `None`, the + function repeats elements of the flattened array. + Default: `None`. Returns: usm_narray: Array with repeated elements. - The returned array must have the same data type as `x`, - is created on the same device as `x` and has the same USM - allocation type as `x`. + The returned array must have the same data type as `x`, is created + on the same device as `x` and has the same USM allocation type as + `x`. If `axis` is `None`, the returned array is one-dimensional, + otherwise, it has the same shape as `x`, except for the axis along + which elements were repeated. Raises: AxisError: if `axis` value is invalid. @@ -951,20 +956,11 @@ def repeat(x, repeats, axis=None): raise TypeError(f"Expected usm_ndarray type, got {type(x)}.") x_ndim = x.ndim - if axis is None: - if x_ndim > 1: - raise ValueError( - f"`axis` cannot be `None` for array of dimension {x_ndim}" - ) - axis = 0 - x_shape = x.shape - if x_ndim > 0: + if axis is not None: axis = normalize_axis_index(operator.index(axis), x_ndim) axis_size = x_shape[axis] else: - if axis != 0: - AxisError("`axis` must be `0` for input of dimension `0`") axis_size = x.size scalar = False @@ -977,8 +973,8 @@ def repeat(x, repeats, axis=None): elif isinstance(repeats, dpt.usm_ndarray): if repeats.ndim > 1: raise ValueError( - "`repeats` array must be 0- or 1-dimensional, got" - "{repeats.ndim}" + "`repeats` array must be 0- or 1-dimensional, got " + f"{repeats.ndim}" ) exec_q = dpctl.utils.get_execution_queue( (x.sycl_queue, repeats.sycl_queue) @@ -1015,22 +1011,22 @@ def repeat(x, repeats, axis=None): if not dpt.all(repeats >= 0): raise ValueError("`repeats` elements must be positive") - elif isinstance(repeats, tuple): + elif isinstance(repeats, (tuple, list, range)): usm_type = x.usm_type exec_q = x.sycl_queue len_reps = len(repeats) - if len_reps != axis_size: - raise ValueError( - "`repeats` tuple must have the same length as the repeated " - "axis" - ) - elif len_reps == 1: + if len_reps == 1: repeats = repeats[0] if repeats < 0: raise ValueError("`repeats` elements must be positive") scalar = True else: + if len_reps != axis_size: + raise ValueError( + "`repeats` sequence must have the same length as the " + "repeated axis" + ) repeats = dpt.asarray( repeats, dtype=dpt.int64, usm_type=usm_type, sycl_queue=exec_q ) @@ -1038,7 +1034,7 @@ def repeat(x, repeats, axis=None): raise ValueError("`repeats` elements must be positive") else: raise TypeError( - "Expected int, tuple, or `usm_ndarray` for second argument," + "Expected int, sequence, or `usm_ndarray` for second argument," f"got {type(repeats)}" ) @@ -1047,7 +1043,10 @@ def repeat(x, repeats, axis=None): if scalar: res_axis_size = repeats * axis_size - res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + if axis is not None: + res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + else: + res_shape = (res_axis_size,) res = dpt.empty( res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q ) @@ -1081,9 +1080,17 @@ def repeat(x, repeats, axis=None): res_axis_size = ti._cumsum_1d( rep_buf, cumsum, sycl_queue=exec_q, depends=[copy_ev] ) - res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + if axis is not None: + res_shape = ( + x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + ) + else: + res_shape = (res_axis_size,) res = dpt.empty( - res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q + res_shape, + dtype=x.dtype, + usm_type=usm_type, + sycl_queue=exec_q, 
) if res_axis_size > 0: ht_rep_ev, _ = ti._repeat_by_sequence( @@ -1103,11 +1110,18 @@ def repeat(x, repeats, axis=None): usm_type=usm_type, sycl_queue=exec_q, ) - # _cumsum_1d synchronizes so `depends` ends here safely res_axis_size = ti._cumsum_1d(repeats, cumsum, sycl_queue=exec_q) - res_shape = x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + if axis is not None: + res_shape = ( + x_shape[:axis] + (res_axis_size,) + x_shape[axis + 1 :] + ) + else: + res_shape = (res_axis_size,) res = dpt.empty( - res_shape, dtype=x.dtype, usm_type=usm_type, sycl_queue=exec_q + res_shape, + dtype=x.dtype, + usm_type=usm_type, + sycl_queue=exec_q, ) if res_axis_size > 0: ht_rep_ev, _ = ti._repeat_by_sequence( diff --git a/dpctl/tensor/libtensor/include/kernels/repeat.hpp b/dpctl/tensor/libtensor/include/kernels/repeat.hpp index da1989fc3c..1f2335fc6c 100644 --- a/dpctl/tensor/libtensor/include/kernels/repeat.hpp +++ b/dpctl/tensor/libtensor/include/kernels/repeat.hpp @@ -46,14 +46,16 @@ namespace py = pybind11; using namespace dpctl::tensor::offset_utils; template class repeat_by_sequence_kernel; template @@ -66,8 +68,8 @@ class RepeatSequenceFunctor const repT *cumsum = nullptr; size_t src_axis_nelems = 1; OrthogIndexer orthog_strider; - AxisIndexer src_axis_strider; - AxisIndexer dst_axis_strider; + SrcAxisIndexer src_axis_strider; + DstAxisIndexer dst_axis_strider; RepIndexer reps_strider; public: @@ -77,8 +79,8 @@ class RepeatSequenceFunctor const repT *cumsum_, size_t src_axis_nelems_, OrthogIndexer orthog_strider_, - AxisIndexer src_axis_strider_, - AxisIndexer dst_axis_strider_, + SrcAxisIndexer src_axis_strider_, + DstAxisIndexer dst_axis_strider_, RepIndexer reps_strider_) : src(src_), dst(dst_), reps(reps_), cumsum(cumsum_), src_axis_nelems(src_axis_nelems_), orthog_strider(orthog_strider_), @@ -167,12 +169,12 @@ repeat_by_sequence_impl(sycl::queue &q, const size_t gws = orthog_nelems * src_axis_nelems; - cgh.parallel_for>( + cgh.parallel_for>( sycl::range<1>(gws), RepeatSequenceFunctor( + Strided1DIndexer, Strided1DIndexer, T, repT>( src_tp, dst_tp, reps_tp, cumsum_tp, src_axis_nelems, orthog_indexer, src_axis_indexer, dst_axis_indexer, reps_indexer)); @@ -197,8 +199,8 @@ typedef sycl::event (*repeat_by_sequence_1d_fn_ptr_t)( char *, const char *, const char *, - py::ssize_t, - py::ssize_t, + int, + const py::ssize_t *, py::ssize_t, py::ssize_t, py::ssize_t, @@ -212,8 +214,8 @@ sycl::event repeat_by_sequence_1d_impl(sycl::queue &q, char *dst_cp, const char *reps_cp, const char *cumsum_cp, - py::ssize_t src_shape, - py::ssize_t src_stride, + int src_nd, + const py::ssize_t *src_shape_strides, py::ssize_t dst_shape, py::ssize_t dst_stride, py::ssize_t reps_shape, @@ -231,19 +233,19 @@ sycl::event repeat_by_sequence_1d_impl(sycl::queue &q, // orthog ndim indexer TwoZeroOffsets_Indexer orthog_indexer{}; // indexers along repeated axis - Strided1DIndexer src_indexer{0, src_shape, src_stride}; + StridedIndexer src_indexer{src_nd, 0, src_shape_strides}; Strided1DIndexer dst_indexer{0, dst_shape, dst_stride}; // indexer along reps array Strided1DIndexer reps_indexer{0, reps_shape, reps_stride}; const size_t gws = src_nelems; - cgh.parallel_for< - repeat_by_sequence_kernel>( + cgh.parallel_for>( sycl::range<1>(gws), - RepeatSequenceFunctor( + RepeatSequenceFunctor( src_tp, dst_tp, reps_tp, cumsum_tp, src_nelems, orthog_indexer, src_indexer, dst_indexer, reps_indexer)); }); @@ -260,10 +262,16 @@ template struct RepeatSequence1DFactory } }; -template +template class repeat_by_scalar_kernel; 
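One behavior pinned down by this patch: a length-1 Python sequence for `repeats` is broadcast over the whole axis, i.e. it behaves like the equivalent scalar. Sketched:

    import dpctl.tensor as dpt

    x = dpt.arange(3, dtype="i4")
    assert dpt.all(dpt.repeat(x, (2,)) == dpt.repeat(x, 2))
    assert dpt.all(dpt.repeat(x, range(1, 2)) == dpt.repeat(x, 1))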
-template +template class RepeatScalarFunctor { private: @@ -272,8 +280,8 @@ class RepeatScalarFunctor const py::ssize_t reps = 1; size_t dst_axis_nelems = 0; OrthogIndexer orthog_strider; - AxisIndexer src_axis_strider; - AxisIndexer dst_axis_strider; + SrcAxisIndexer src_axis_strider; + DstAxisIndexer dst_axis_strider; public: RepeatScalarFunctor(const T *src_, @@ -281,8 +289,8 @@ class RepeatScalarFunctor const py::ssize_t reps_, size_t dst_axis_nelems_, OrthogIndexer orthog_strider_, - AxisIndexer src_axis_strider_, - AxisIndexer dst_axis_strider_) + SrcAxisIndexer src_axis_strider_, + DstAxisIndexer dst_axis_strider_) : src(src_), dst(dst_), reps(reps_), dst_axis_nelems(dst_axis_nelems_), orthog_strider(orthog_strider_), src_axis_strider(src_axis_strider_), dst_axis_strider(dst_axis_strider_) @@ -354,10 +362,11 @@ sycl::event repeat_by_scalar_impl(sycl::queue &q, const size_t gws = orthog_nelems * dst_axis_nelems; - cgh.parallel_for>( + cgh.parallel_for>( sycl::range<1>(gws), - RepeatScalarFunctor( + RepeatScalarFunctor( src_tp, dst_tp, reps, dst_axis_nelems, orthog_indexer, src_axis_indexer, dst_axis_indexer)); }); @@ -380,8 +389,8 @@ typedef sycl::event (*repeat_by_scalar_1d_fn_ptr_t)( const char *, char *, const py::ssize_t, - py::ssize_t, - py::ssize_t, + int, + const py::ssize_t *, py::ssize_t, py::ssize_t, const std::vector &); @@ -392,8 +401,8 @@ sycl::event repeat_by_scalar_1d_impl(sycl::queue &q, const char *src_cp, char *dst_cp, const py::ssize_t reps, - py::ssize_t src_shape, - py::ssize_t src_stride, + int src_nd, + const py::ssize_t *src_shape_strides, py::ssize_t dst_shape, py::ssize_t dst_stride, const std::vector &depends) @@ -407,17 +416,18 @@ sycl::event repeat_by_scalar_1d_impl(sycl::queue &q, // orthog ndim indexer TwoZeroOffsets_Indexer orthog_indexer{}; // indexers along repeated axis - Strided1DIndexer src_indexer(0, src_shape, src_stride); + StridedIndexer src_indexer(src_nd, 0, src_shape_strides); Strided1DIndexer dst_indexer{0, dst_shape, dst_stride}; const size_t gws = dst_nelems; - cgh.parallel_for>( + cgh.parallel_for>( sycl::range<1>(gws), - RepeatScalarFunctor( - src_tp, dst_tp, reps, dst_nelems, orthog_indexer, src_indexer, - dst_indexer)); + RepeatScalarFunctor(src_tp, dst_tp, reps, + dst_nelems, orthog_indexer, + src_indexer, dst_indexer)); }); return repeat_ev; diff --git a/dpctl/tensor/libtensor/source/repeat.cpp b/dpctl/tensor/libtensor/source/repeat.cpp index 0dbfb17a5d..3b1c956dd4 100644 --- a/dpctl/tensor/libtensor/source/repeat.cpp +++ b/dpctl/tensor/libtensor/source/repeat.cpp @@ -237,18 +237,37 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, assert(dst_shape_vec.size() == 1); assert(dst_strides_vec.size() == 1); - py::ssize_t src_shape(0); - py::ssize_t src_stride(0); - if (src_nd > 0) { - src_shape = src_shape_vec[0]; - src_stride = src_strides_vec[0]; + if (src_nd == 0) { + src_shape_vec = {0}; + src_strides_vec = {0}; + } + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple1 = + device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + py::ssize_t *packed_src_shape_strides = + std::get<0>(ptr_size_event_tuple1); + if (packed_src_shape_strides == nullptr) { + throw std::runtime_error("Unable to allocate device memory"); } + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); - sycl::event repeat_ev = + repeat_ev = fn(exec_q, src_axis_nelems, src_data_p, dst_data_p, reps_data_p, - cumsum_data_p, src_shape, src_stride, 
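Passing the full packed shape/strides of the source, rather than a single (shape, stride) pair, lets these kernels repeat arbitrary strided views correctly. Expected behavior, sketched:

    import dpctl.tensor as dpt

    x = dpt.arange(10, dtype="i4")[::-2]   # reversed, strided view
    r = dpt.repeat(x, 2)
    assert dpt.all(r[::2] == x)
    assert dpt.all(r[1::2] == x)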
dst_shape_vec[0], - dst_strides_vec[0], reps_shape_vec[0], reps_strides_vec[0], - depends); + cumsum_data_p, src_nd, packed_src_shape_strides, + dst_shape_vec[0], dst_strides_vec[0], reps_shape_vec[0], + reps_strides_vec[0], depends); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(repeat_ev); + const auto &ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_src_shape_strides] { + sycl::free(packed_src_shape_strides, ctx); + }); + }); + host_task_events.push_back(cleanup_tmp_allocations_ev); } else { // non-empty othogonal directions @@ -343,6 +362,162 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, return std::make_pair(py_obj_management_host_task_ev, repeat_ev); } +std::pair +py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &reps, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + const std::vector &depends) +{ + + int dst_nd = dst.get_ndim(); + if (dst_nd != 1) { + throw py::value_error( + "`dst` array must be 1-dimensional when repeating a full array"); + } + + int reps_nd = reps.get_ndim(); + if (reps_nd != 1) { + throw py::value_error("`reps` array must be 1-dimensional"); + } + + if (cumsum.get_ndim() != 1) { + throw py::value_error("`cumsum` array must be 1-dimensional."); + } + + if (!cumsum.is_c_contiguous()) { + throw py::value_error("Expecting `cumsum` array to be C-contiguous."); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, reps, cumsum, dst})) + { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + size_t src_sz = src.get_size(); + size_t reps_sz = reps.get_size(); + size_t cumsum_sz = cumsum.get_size(); + + // shape at repeated axis must be equal to the sum of reps + if (src_sz != reps_sz || src_sz != cumsum_sz) { + throw py::value_error("Inconsistent array dimensions"); + } + + if (src_sz == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + // ensure that dst is sufficiently ample + auto dst_offsets = dst.get_minmax_offsets(); + // destination must be ample enough to accommodate all elements + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < static_cast(dst.get_size())) { + throw py::value_error( + "Memory addressed by the destination array can not " + "accommodate all the " + "array elements."); + } + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with src, cumsum, or reps + if (overlap(dst, src) || overlap(dst, reps) || overlap(dst, cumsum)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + int reps_typenum = reps.get_typenum(); + int cumsum_typenum = cumsum.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + int reps_typeid = array_types.typenum_to_lookup_id(reps_typenum); + int cumsum_typeid = array_types.typenum_to_lookup_id(cumsum_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data type"); + } + + constexpr int int64_typeid = static_cast(td_ns::typenum_t::INT64); + if (cumsum_typeid != int64_typeid) { + throw py::value_error( + "Unexpected data type of `cumsum` array, expecting " 
+ "'int64'"); + } + + if (reps_typeid != cumsum_typeid) { + throw py::value_error("`reps` array must have the same elemental " + "data type as cumsum"); + } + + const char *src_data_p = src.get_data(); + const char *reps_data_p = reps.get_data(); + const char *cumsum_data_p = cumsum.get_data(); + char *dst_data_p = dst.get_data(); + + int src_nd = src.get_ndim(); + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + if (src_nd == 0) { + src_shape_vec = {0}; + src_strides_vec = {0}; + } + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + auto reps_shape_vec = reps.get_shape_vector(); + auto reps_strides_vec = reps.get_strides_vector(); + + std::vector host_task_events{}; + + auto fn = repeat_by_sequence_1d_dispatch_vector[src_typeid]; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + py::ssize_t *packed_src_shapes_strides = std::get<0>(ptr_size_event_tuple1); + if (packed_src_shapes_strides == nullptr) { + throw std::runtime_error("Unable to allocate device memory"); + } + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + sycl::event repeat_ev = + fn(exec_q, src_sz, src_data_p, dst_data_p, reps_data_p, cumsum_data_p, + src_nd, packed_src_shapes_strides, dst_shape_vec[0], + dst_strides_vec[0], reps_shape_vec[0], reps_strides_vec[0], depends); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(repeat_ev); + const auto &ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_src_shapes_strides] { + sycl::free(packed_src_shapes_strides, ctx); + }); + }); + host_task_events.push_back(cleanup_tmp_allocations_ev); + host_task_events.push_back(repeat_ev); + + sycl::event py_obj_management_host_task_ev = dpctl::utils::keep_args_alive( + exec_q, {src, reps, cumsum, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, repeat_ev); +} + std::pair py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, const dpctl::tensor::usm_ndarray &dst, @@ -452,15 +627,42 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, assert(dst_shape_vec.size() == 1); assert(dst_strides_vec.size() == 1); - py::ssize_t src_shape(0); - py::ssize_t src_stride(0); if (src_nd > 0) { - src_shape = src_shape_vec[0]; - src_stride = src_strides_vec[0]; + src_shape_vec = {0}; + src_strides_vec = {0}; } - sycl::event repeat_ev = - fn(exec_q, dst_axis_nelems, src_data_p, dst_data_p, reps, src_shape, - src_stride, dst_shape_vec[0], dst_strides_vec[0], depends); + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple1 = + device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + py::ssize_t *packed_src_shape_strides = + std::get<0>(ptr_size_event_tuple1); + if (packed_src_shape_strides == nullptr) { + throw std::runtime_error("Unable to allocate device memory"); + } + sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + 
all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + repeat_ev = fn(exec_q, dst_axis_nelems, src_data_p, dst_data_p, reps, + src_nd, packed_src_shape_strides, dst_shape_vec[0], + dst_strides_vec[0], depends); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(repeat_ev); + const auto &ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_src_shape_strides] { + sycl::free(packed_src_shape_strides, ctx); + }); + }); + host_task_events.push_back(cleanup_tmp_allocations_ev); } else { // non-empty othogonal directions @@ -554,6 +756,126 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, return std::make_pair(py_obj_management_host_task_ev, repeat_ev); } +std::pair +py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const py::ssize_t reps, + sycl::queue &exec_q, + const std::vector &depends) +{ + int dst_nd = dst.get_ndim(); + if (dst_nd != 1) { + throw py::value_error( + "`dst` array must be 1-dimensional when repeating a full array"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + size_t src_sz = src.get_size(); + size_t dst_sz = dst.get_size(); + + // shape at repeated axis must be equal to the shape of src at the axis * + // reps + if ((src_sz * reps) != dst_sz) { + throw py::value_error("Inconsistent array dimensions"); + } + + if (src_sz == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + + // ensure that dst is sufficiently ample + auto dst_offsets = dst.get_minmax_offsets(); + // destination must be ample enough to accommodate all elements + { + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < static_cast(src_sz * reps)) { + throw py::value_error( + "Memory addressed by the destination array can not " + "accommodate all the " + "array elements."); + } + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + // check that dst does not intersect with src + if (overlap(dst, src)) { + throw py::value_error("Destination array overlaps with inputs"); + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + auto const &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + if (src_typeid != dst_typeid) { + throw py::value_error( + "Destination array must have the same elemental data type"); + } + + const char *src_data_p = src.get_data(); + char *dst_data_p = dst.get_data(); + + int src_nd = src.get_ndim(); + auto src_shape_vec = src.get_shape_vector(); + auto src_strides_vec = src.get_strides_vector(); + + if (src_nd == 0) { + src_shape_vec = {0}; + src_strides_vec = {0}; + } + + auto dst_shape_vec = dst.get_shape_vector(); + auto dst_strides_vec = dst.get_strides_vector(); + + std::vector host_task_events{}; + + auto fn = repeat_by_scalar_1d_dispatch_vector[src_typeid]; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + const auto &ptr_size_event_tuple1 = device_allocate_and_pack( + exec_q, host_task_events, src_shape_vec, src_strides_vec); + py::ssize_t *packed_src_shape_strides = std::get<0>(ptr_size_event_tuple1); + if (packed_src_shape_strides == nullptr) { + throw std::runtime_error("Unable to allocate device memory"); + } + sycl::event copy_shapes_strides_ev = 
std::get<2>(ptr_size_event_tuple1); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + + sycl::event repeat_ev = fn(exec_q, dst_sz, src_data_p, dst_data_p, reps, + src_nd, packed_src_shape_strides, + dst_shape_vec[0], dst_strides_vec[0], depends); + + sycl::event cleanup_tmp_allocations_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(repeat_ev); + const auto &ctx = exec_q.get_context(); + cgh.host_task([ctx, packed_src_shape_strides] { + sycl::free(packed_src_shape_strides, ctx); + }); + }); + + host_task_events.push_back(cleanup_tmp_allocations_ev); + host_task_events.push_back(repeat_ev); + + sycl::event py_obj_management_host_task_ev = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(py_obj_management_host_task_ev, repeat_ev); +} + } // namespace py_internal } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/repeat.hpp b/dpctl/tensor/libtensor/source/repeat.hpp index 87fb0a0847..65ace36516 100644 --- a/dpctl/tensor/libtensor/source/repeat.hpp +++ b/dpctl/tensor/libtensor/source/repeat.hpp @@ -48,6 +48,14 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, sycl::queue &exec_q, const std::vector &depends); +extern std::pair +py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &reps, + const dpctl::tensor::usm_ndarray &cumsum, + sycl::queue &exec_q, + const std::vector &depends); + extern std::pair py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, const dpctl::tensor::usm_ndarray &dst, @@ -56,6 +64,13 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, sycl::queue &exec_q, const std::vector &depends); +extern std::pair +py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const py::ssize_t reps, + sycl::queue &exec_q, + const std::vector &depends); + } // namespace py_internal } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp index 6bd0649c1f..0e8b4236b6 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_py.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include "dpctl4pybind11.hpp" @@ -402,13 +403,43 @@ PYBIND11_MODULE(_tensor_impl, m) py::arg("x2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list()); - m.def("_repeat_by_sequence", &py_repeat_by_sequence, "", py::arg("src"), + auto repeat_sequence = [](const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const dpctl::tensor::usm_ndarray &reps, + const dpctl::tensor::usm_ndarray &cumsum, + std::optional axis, sycl::queue &exec_q, + const std::vector depends) + -> std::pair { + if (axis) { + return py_repeat_by_sequence(src, dst, reps, cumsum, axis.value(), + exec_q, depends); + } + else { + return py_repeat_by_sequence(src, dst, reps, cumsum, exec_q, + depends); + } + }; + m.def("_repeat_by_sequence", repeat_sequence, py::arg("src"), py::arg("dst"), py::arg("reps"), py::arg("cumsum"), py::arg("axis"), py::arg("sycl_queue"), py::arg("depends") = py::list()); - m.def("_repeat_by_scalar", &py_repeat_by_scalar, "", py::arg("src"), - py::arg("dst"), py::arg("reps"), py::arg("axis"), - 
py::arg("sycl_queue"), py::arg("depends") = py::list()); + auto repeat_scalar = [](const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, + const py::ssize_t reps, std::optional axis, + sycl::queue &exec_q, + const std::vector depends) + -> std::pair { + if (axis) { + return py_repeat_by_scalar(src, dst, reps, axis.value(), exec_q, + depends); + } + else { + return py_repeat_by_scalar(src, dst, reps, exec_q, depends); + } + }; + m.def("_repeat_by_scalar", repeat_scalar, py::arg("src"), py::arg("dst"), + py::arg("reps"), py::arg("axis"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); dpctl::tensor::py_internal::init_elementwise_functions(m); dpctl::tensor::py_internal::init_boolean_reduction_functions(m); diff --git a/dpctl/tests/test_usm_ndarray_manipulation.py b/dpctl/tests/test_usm_ndarray_manipulation.py index 2126727d5b..ae32afdba9 100644 --- a/dpctl/tests/test_usm_ndarray_manipulation.py +++ b/dpctl/tests/test_usm_ndarray_manipulation.py @@ -1193,11 +1193,17 @@ def test_repeat_size_0_outputs(): assert res.size == 0 assert res.shape == (3, 0, 5) - x = dpt.ones((3, 2, 5)) res = dpt.repeat(x, (0, 0), axis=1) assert res.size == 0 assert res.shape == (3, 0, 5) + # axis=None cases + res = dpt.repeat(x, 0) + assert res.size == 0 + + res = dpt.repeat(x, (0,) * x.size) + assert res.size == 0 + def test_repeat_strides(): get_queue_or_skip() @@ -1220,6 +1226,17 @@ def test_repeat_strides(): res = dpt.repeat(x1, (reps,) * x1.shape[0], axis=0) assert dpt.all(res == expected_res) + # axis=None + x = dpt.reshape(dpt.arange(10 * 10), (10, 10)) + x1 = dpt.reshape(x[::-2, :], -1) + x2 = x[::-2, :] + expected_res = dpt.empty(10 * 10, dtype="i4") + expected_res[::2], expected_res[1::2] = x1, x1 + res = dpt.repeat(x2, reps) + assert dpt.all(res == expected_res) + res = dpt.repeat(x2, (reps,) * x1.size) + assert dpt.all(res == expected_res) + def test_repeat_casting(): get_queue_or_skip() @@ -1256,11 +1273,6 @@ def test_repeat_arg_validation(): with pytest.raises(ValueError): dpt.repeat(x, 2, axis=1) - # x.ndim cannot be > 1 for axis=None - x = dpt.empty((5, 10)) - with pytest.raises(ValueError): - dpt.repeat(x, 2, axis=None) - # repeats must be positive x = dpt.empty(5) with pytest.raises(ValueError): From 2c757d4eb746f241d0e15260d692d36b87b666f0 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 5 Oct 2023 16:48:59 -0500 Subject: [PATCH 05/83] Do apt-get update before installing gdb --- .github/workflows/conda-package.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml index 09806bcce9..e715c444c0 100644 --- a/.github/workflows/conda-package.yml +++ b/.github/workflows/conda-package.yml @@ -181,6 +181,7 @@ jobs: python -c "import dpctl; dpctl.lsplatform(verbosity=2)" - name: Install gdb run: | + sudo apt-get update --fix-missing sudo apt-get install -y gdb - name: Run test_elementwise under gdb run: | From 049cd77c931adcf05acb342698159aca9bc93088 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 4 Oct 2023 06:29:27 -0500 Subject: [PATCH 06/83] Added a comment line --- dpctl/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/dpctl/CMakeLists.txt b/dpctl/CMakeLists.txt index a466d3eef1..795ca09c78 100644 --- a/dpctl/CMakeLists.txt +++ b/dpctl/CMakeLists.txt @@ -191,6 +191,7 @@ foreach(_cy_file ${_cython_sources}) build_dpctl_ext(${_trgt} ${_cy_file} "dpctl") endforeach() +# _sycl_queue include _host_task_util.hpp target_include_directories(_sycl_queue PRIVATE 
${CMAKE_CURRENT_SOURCE_DIR}) add_subdirectory(program) From f73a1c4c85f1bc3729f2fde8371563003817329a Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 4 Oct 2023 06:30:22 -0500 Subject: [PATCH 07/83] Implement Python API for ext_intel_device_info descriptors https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/supported/sycl_ext_intel_device_info.md This includes HW characteristics for Intel Level-Zero GPU devices as well as access to PCI device-identifier. --- dpctl/utils/CMakeLists.txt | 21 +++++ dpctl/utils/__init__.py | 73 +++++++++++++++ dpctl/utils/src/device_queries.cpp | 139 +++++++++++++++++++++++++++++ 3 files changed, 233 insertions(+) create mode 100644 dpctl/utils/src/device_queries.cpp diff --git a/dpctl/utils/CMakeLists.txt b/dpctl/utils/CMakeLists.txt index 11b0930052..8bc65e3056 100644 --- a/dpctl/utils/CMakeLists.txt +++ b/dpctl/utils/CMakeLists.txt @@ -4,3 +4,24 @@ foreach(_cy_file ${_cython_sources}) get_filename_component(_trgt ${_cy_file} NAME_WLE) build_dpctl_ext(${_trgt} ${_cy_file} "dpctl/utils") endforeach() + +add_custom_target(_dpctl4pybind11_header_ready + DEPENDS + _usmarray_copy_capi_include + _memory_copy_capi_include + _sycl_device_copy_capi_include + _sycl_queue_copy_capi_include + _sycl_context_copy_capi_include + _sycl_event_copy_capi_include +) + +set(python_module_name _device_queries) +pybind11_add_module(${python_module_name} MODULE + ${CMAKE_CURRENT_SOURCE_DIR}/src/device_queries.cpp +) +target_include_directories(${python_module_name} + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../include +) +add_dependencies(${python_module_name} _dpctl4pybind11_header_ready) +install(TARGETS ${python_module_name} DESTINATION "dpctl/utils") diff --git a/dpctl/utils/__init__.py b/dpctl/utils/__init__.py index 671564cda5..b2d29721e6 100644 --- a/dpctl/utils/__init__.py +++ b/dpctl/utils/__init__.py @@ -18,14 +18,87 @@ A collection of utility functions. """ +from .._sycl_device import SyclDevice from ._compute_follows_data import ( ExecutionPlacementError, get_coerced_usm_type, get_execution_queue, validate_usm_type, ) +from ._device_queries import ( + intel_device_info_device_id, + intel_device_info_gpu_eu_count, + intel_device_info_gpu_eu_count_per_subslice, + intel_device_info_gpu_eu_simd_width, + intel_device_info_gpu_hw_threads_per_eu, + intel_device_info_gpu_slices, + intel_device_info_gpu_subslices_per_slice, + intel_device_info_max_mem_bandwidth, +) from ._onetrace_context import onetrace_enabled + +def intel_device_info(dev): + """intel_device_info(sycl_device) + + For Intel(R) GPU devices returns a dictionary + with device architectural details, and an empty + dictionary otherwise. The dictionary contains + the following keys: + + device_id: 32-bits device PCI identifier + gpu_eu_count: Total number of execution units + gpu_hw_threads_per_eu: Number of thread contexts in EU + gpu_eu_simd_width: Physical SIMD width of EU + gpu_slices: Total number of slices + gpu_subslices_per_slice: Number of sub-slices per slice + gpu_eu_count_per_subslice: Number of EUs in subslice + max_mem_bandwidth: Maximum memory bandwidth in bytes/second + + Unsupported descriptors are omitted from the dictionary. + Descriptors other than PCI identifier are supported only for + SyclDevices with Leve-Zero backend. 
+ """ + if not isinstance(dev, SyclDevice): + raise TypeError(f"Expected dpctl.SyclDevice, got {type(dev)}") + dev_id = intel_device_info_device_id(dev) + if dev_id: + res = { + "device_id": dev_id, + } + if dev.has_aspect_gpu: + eu_count = intel_device_info_gpu_eu_count(dev) + if eu_count: + res["gpu_eu_count"] = eu_count + hw_threads = intel_device_info_gpu_hw_threads_per_eu(dev) + if hw_threads: + res["gpu_hw_threads_per_eu"] = hw_threads + simd_w = intel_device_info_gpu_eu_simd_width(dev) + if simd_w: + res["gpu_eu_simd_width"] = simd_w + n_slices = intel_device_info_gpu_slices(dev) + if n_slices: + res["gpu_slices"] = n_slices + n_subslices = intel_device_info_gpu_subslices_per_slice(dev) + if n_subslices: + res["gpu_subslices_per_slice"] = n_subslices + n_eu_per_subslice = intel_device_info_gpu_eu_count_per_subslice(dev) + if n_eu_per_subslice: + res["gpu_eu_count_per_subslice"] = n_eu_per_subslice + bw = intel_device_info_max_mem_bandwidth(dev) + if bw: + res["max_mem_bandwidth"] = bw + return res + return dict() + + +def _is_gen9(dev): + if not isinstance(dev, SyclDevice): + raise TypeError(f"Expected dpctl.SyclDevice, got {type(dev)}") + dev_id = intel_device_info_device_id(dev) + return (dev_id & 0xFF00) == 0x3E00 + + __all__ = [ "get_execution_queue", "get_coerced_usm_type", diff --git a/dpctl/utils/src/device_queries.cpp b/dpctl/utils/src/device_queries.cpp new file mode 100644 index 0000000000..6407e69dbb --- /dev/null +++ b/dpctl/utils/src/device_queries.cpp @@ -0,0 +1,139 @@ +#include "dpctl4pybind11.hpp" +#include +#include +#include + +#include +#include + +namespace +{ + +std::uint32_t py_intel_device_id(const sycl::device &d) +{ + static constexpr std::uint32_t device_id_unavailable = 0; + + if (d.has(sycl::aspect::ext_intel_device_id)) { + return d.get_info(); + } + + return device_id_unavailable; +} + +std::uint32_t py_intel_gpu_eu_count(const sycl::device &d) +{ + static constexpr std::uint32_t eu_count_unavailable = 0; + + if (d.has(sycl::aspect::ext_intel_gpu_eu_count)) { + return d.get_info(); + } + + return eu_count_unavailable; +} + +std::uint32_t py_intel_gpu_hw_threads_per_eu(const sycl::device &d) +{ + static constexpr std::uint32_t thread_count_unavailable = 0; + + if (d.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) { + return d + .get_info(); + } + + return thread_count_unavailable; +} + +std::uint32_t py_intel_gpu_eu_simd_width(const sycl::device &d) +{ + static constexpr std::uint32_t width_unavailable = 0; + + if (d.has(sycl::aspect::ext_intel_gpu_eu_simd_width)) { + return d.get_info(); + } + + return width_unavailable; +} + +std::uint32_t py_intel_gpu_slices(const sycl::device &d) +{ + static constexpr std::uint32_t count_unavailable = 0; + + if (d.has(sycl::aspect::ext_intel_gpu_slices)) { + return d.get_info(); + } + + return count_unavailable; +} + +std::uint32_t py_intel_gpu_subslices_per_slice(const sycl::device &d) +{ + static constexpr std::uint32_t count_unavailable = 0; + + if (d.has(sycl::aspect::ext_intel_gpu_subslices_per_slice)) { + return d.get_info< + sycl::ext::intel::info::device::gpu_subslices_per_slice>(); + } + + return count_unavailable; +} + +std::uint32_t py_intel_gpu_eu_count_per_subslice(const sycl::device &d) +{ + static constexpr std::uint32_t count_unavailable = 0; + + if (d.has(sycl::aspect::ext_intel_gpu_eu_count_per_subslice)) { + return d.get_info< + sycl::ext::intel::info::device::gpu_eu_count_per_subslice>(); + } + + return count_unavailable; +} + +std::uint64_t py_intel_max_mem_bandwidth(const sycl::device &d) +{ 
+ static constexpr std::uint64_t bandwidth_unavailable = 0; + + if (d.has(sycl::aspect::ext_intel_max_mem_bandwidth)) { + return d.get_info(); + } + + return bandwidth_unavailable; +} + +}; // namespace + +PYBIND11_MODULE(_device_queries, m) +{ + m.def("intel_device_info_device_id", &py_intel_device_id, + "Get ext_intel_device_id for the device, zero if not an intel device", + py::arg("device")); + + m.def("intel_device_info_gpu_eu_count", &py_intel_gpu_eu_count, + "Returns the number of execution units (EUs) associated with the " + "Intel GPU.", + py::arg("device")); + + m.def("intel_device_info_gpu_hw_threads_per_eu", + &py_intel_gpu_hw_threads_per_eu, + "Returns the number of hardware threads in EU.", py::arg("device")); + + m.def("intel_device_info_gpu_eu_simd_width", &py_intel_gpu_eu_simd_width, + "Returns the physical SIMD width of the execution unit (EU).", + py::arg("device")); + + m.def("intel_device_info_gpu_slices", &py_intel_gpu_slices, + "Returns the number of slices in the GPU device, or zero.", + py::arg("device")); + + m.def("intel_device_info_gpu_subslices_per_slice", + &py_intel_gpu_subslices_per_slice, + "Returns the number of subslices per slice.", py::arg("device")); + + m.def("intel_device_info_gpu_eu_count_per_subslice", + &py_intel_gpu_eu_count_per_subslice, + "Returns the number of EUs per subslice of GPU.", py::arg("device")); + + m.def("intel_device_info_max_mem_bandwidth", &py_intel_max_mem_bandwidth, + "Returns the maximum memory bandwidth in units of bytes/second.", + py::arg("device")); +} From d11e64ecddd804e8dfb46dec77c063bb9360811f Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 4 Oct 2023 06:32:04 -0500 Subject: [PATCH 08/83] Add tests for new function in dpctl.utils Adds a test for _is_gen9 helper utility useful for skipping tests known to fail on Gen9. Adds a test for intel_device_info function. Test that descriptor names do not have typos. 
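
To make the intent of these tests concrete, a minimal usage sketch of the
new helper follows (illustrative only: the device selection and the keys
reported depend on the hardware, and unsupported descriptors are simply
absent from the returned dictionary):

    import dpctl
    import dpctl.utils

    dev = dpctl.select_default_device()
    info = dpctl.utils.intel_device_info(dev)
    # empty dict on non-Intel devices, otherwise a subset of the
    # documented keys, e.g. info.get("gpu_eu_count")
    if "device_id" in info:
        print(hex(info["device_id"]))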
---
 dpctl/tests/test_utils.py | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/dpctl/tests/test_utils.py b/dpctl/tests/test_utils.py
index df4a9f503f..c9cbe6cda7 100644
--- a/dpctl/tests/test_utils.py
+++ b/dpctl/tests/test_utils.py
@@ -21,6 +21,7 @@
 
 import dpctl
 import dpctl.utils
+from dpctl.enum_types import backend_type
 
 
 def test_get_execution_queue_input_validation():
@@ -122,3 +123,38 @@ def test_onetrace_enabled():
     with dpctl.utils.onetrace_enabled():
         assert os.getenv(v_name, None) == "1"
     assert os.getenv(v_name, None) == v_v
+
+
+def test__is_gen9():
+    try:
+        d = dpctl.select_default_device()
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("Default device could not be created")
+    u = dpctl.utils._is_gen9(d)
+    assert isinstance(u, bool)
+
+
+def test_intel_device_info():
+    try:
+        d = dpctl.select_default_device()
+    except dpctl.SyclDeviceCreationError:
+        pytest.skip("Default device could not be created")
+    descr = dpctl.utils.intel_device_info(d)
+    assert isinstance(descr, dict)
+    assert ("device_id" in descr) or (
+        not d.has_aspect_cpu and not d.backend == backend_type.level_zero
+    )
+    allowed_names = [
+        "device_id",
+        "gpu_slices",
+        "gpu_eu_count",
+        "gpu_eu_simd_width",
+        "gpu_hw_threads_per_eu",
+        "gpu_subslices_per_slice",
+        "gpu_eu_count_per_subslice",
+        "max_mem_bandwidth",
+    ]
+    for descriptor_name in descr.keys():
+        test = descriptor_name in allowed_names
+        err_msg = f"Key '{descriptor_name}' is not recognized"
+        assert test, err_msg

From ebf118aaddd433d7a387ad2d64e17121e2b494b4 Mon Sep 17 00:00:00 2001
From: ndgrigorian <46709016+ndgrigorian@users.noreply.github.com>
Date: Fri, 6 Oct 2023 08:49:22 -0700
Subject: [PATCH 09/83] Repeat Python bindings properly pass host task
 dependencies (#1430)

The 1d variant of repeat was not passed the host-task event dependency
for allocating shapes and strides on the device. This caused sporadic
segfaults, where the kernel would attempt to access unallocated device
data.
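
A minimal Python-level sketch of the affected code path (assuming any
working SYCL device; with axis=None, dpt.repeat dispatches to the 1d
variants that previously lacked the dependency):

    import dpctl.tensor as dpt

    x = dpt.arange(10, dtype="i4")
    # prior to this fix, the repeat kernel could be submitted before the
    # packed shape/stride buffer was populated on the device
    y = dpt.repeat(x, 3)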
--- dpctl/tensor/libtensor/source/repeat.cpp | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/dpctl/tensor/libtensor/source/repeat.cpp b/dpctl/tensor/libtensor/source/repeat.cpp index 3b1c956dd4..391f995feb 100644 --- a/dpctl/tensor/libtensor/source/repeat.cpp +++ b/dpctl/tensor/libtensor/source/repeat.cpp @@ -253,11 +253,18 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, } sycl::event copy_shapes_strides_ev = std::get<2>(ptr_size_event_tuple1); + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.insert(all_deps.end(), depends.begin(), depends.end()); + all_deps.push_back(copy_shapes_strides_ev); + + assert(all_deps.size() == depends.size() + 1); + repeat_ev = fn(exec_q, src_axis_nelems, src_data_p, dst_data_p, reps_data_p, cumsum_data_p, src_nd, packed_src_shape_strides, dst_shape_vec[0], dst_strides_vec[0], reps_shape_vec[0], - reps_strides_vec[0], depends); + reps_strides_vec[0], all_deps); sycl::event cleanup_tmp_allocations_ev = exec_q.submit([&](sycl::handler &cgh) { @@ -496,10 +503,10 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, assert(all_deps.size() == depends.size() + 1); - sycl::event repeat_ev = - fn(exec_q, src_sz, src_data_p, dst_data_p, reps_data_p, cumsum_data_p, - src_nd, packed_src_shapes_strides, dst_shape_vec[0], - dst_strides_vec[0], reps_shape_vec[0], reps_strides_vec[0], depends); + sycl::event repeat_ev = fn( + exec_q, src_sz, src_data_p, dst_data_p, reps_data_p, cumsum_data_p, + src_nd, packed_src_shapes_strides, dst_shape_vec[0], dst_strides_vec[0], + reps_shape_vec[0], reps_strides_vec[0], all_deps); sycl::event cleanup_tmp_allocations_ev = exec_q.submit([&](sycl::handler &cgh) { @@ -652,7 +659,7 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, repeat_ev = fn(exec_q, dst_axis_nelems, src_data_p, dst_data_p, reps, src_nd, packed_src_shape_strides, dst_shape_vec[0], - dst_strides_vec[0], depends); + dst_strides_vec[0], all_deps); sycl::event cleanup_tmp_allocations_ev = exec_q.submit([&](sycl::handler &cgh) { @@ -856,7 +863,7 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, sycl::event repeat_ev = fn(exec_q, dst_sz, src_data_p, dst_data_p, reps, src_nd, packed_src_shape_strides, - dst_shape_vec[0], dst_strides_vec[0], depends); + dst_shape_vec[0], dst_strides_vec[0], all_deps); sycl::event cleanup_tmp_allocations_ev = exec_q.submit([&](sycl::handler &cgh) { From 0988dd04dc4ad140575e6f159a086f69cf2482f6 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 6 Oct 2023 12:46:58 -0500 Subject: [PATCH 10/83] Skip dpt.prod testing for complex result dtype on Gen9 --- dpctl/tests/test_tensor_sum.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/dpctl/tests/test_tensor_sum.py b/dpctl/tests/test_tensor_sum.py index dc647febf7..a1ecb5df24 100644 --- a/dpctl/tests/test_tensor_sum.py +++ b/dpctl/tests/test_tensor_sum.py @@ -17,6 +17,7 @@ import pytest import dpctl.tensor as dpt +import dpctl.utils as du from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported _all_dtypes = [ @@ -242,6 +243,16 @@ def test_prod_arg_out_dtype_matrix(arg_dtype, out_dtype): skip_if_dtype_not_supported(arg_dtype, q) skip_if_dtype_not_supported(out_dtype, q) + out_dtype = dpt.dtype(out_dtype) + arg_dtype = dpt.dtype(arg_dtype) + if dpt.isdtype(out_dtype, "complex floating") and du._is_gen9( + q.sycl_device + ): + pytest.skip( + "Product reduction for complex output are known " + "to fail for Gen9 with 2024.0 compiler" + ) + m = 
dpt.ones(100, dtype=arg_dtype) r = dpt.prod(m, dtype=out_dtype) From 79a9d7848c82f6c1f7cfa68eb03bda27f067e87e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 6 Oct 2023 16:34:39 -0500 Subject: [PATCH 11/83] Remove _is_gen9, export intel_device_info --- dpctl/utils/__init__.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/dpctl/utils/__init__.py b/dpctl/utils/__init__.py index b2d29721e6..fb41b3b74c 100644 --- a/dpctl/utils/__init__.py +++ b/dpctl/utils/__init__.py @@ -92,17 +92,11 @@ def intel_device_info(dev): return dict() -def _is_gen9(dev): - if not isinstance(dev, SyclDevice): - raise TypeError(f"Expected dpctl.SyclDevice, got {type(dev)}") - dev_id = intel_device_info_device_id(dev) - return (dev_id & 0xFF00) == 0x3E00 - - __all__ = [ "get_execution_queue", "get_coerced_usm_type", "validate_usm_type", "onetrace_enabled", + "intel_device_info", "ExecutionPlacementError", ] From 662bc4530e2e1623d3eb09c375f1e8d7a42d81d3 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 6 Oct 2023 16:35:05 -0500 Subject: [PATCH 12/83] Removed test for _is_gen9, as it was removed --- dpctl/tests/test_utils.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/dpctl/tests/test_utils.py b/dpctl/tests/test_utils.py index c9cbe6cda7..1aab7fd7e7 100644 --- a/dpctl/tests/test_utils.py +++ b/dpctl/tests/test_utils.py @@ -125,15 +125,6 @@ def test_onetrace_enabled(): assert os.getenv(v_name, None) == v_v -def test__is_gen9(): - try: - d = dpctl.select_default_device() - except dpctl.SyclDeviceCreationError: - pytest.skip("Default device could not be created") - u = dpctl.utils._is_gen9(d) - assert isinstance(u, bool) - - def test_intel_device_info(): try: d = dpctl.select_default_device() From cd0ec5d3208f26afc4a9c90783df1f3b7beea532 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 6 Oct 2023 16:35:22 -0500 Subject: [PATCH 13/83] Skip prod tests for complex output types on Gen9 --- dpctl/tests/test_tensor_sum.py | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/dpctl/tests/test_tensor_sum.py b/dpctl/tests/test_tensor_sum.py index a1ecb5df24..f6d1ca086b 100644 --- a/dpctl/tests/test_tensor_sum.py +++ b/dpctl/tests/test_tensor_sum.py @@ -188,11 +188,28 @@ def test_axis0_bug(): assert dpt.all(s == expected) +def _any_complex(dtypes): + return any(dpt.isdtype(dpt.dtype(dt), "complex floating") for dt in dtypes) + + +def _skip_on_this_device(sycl_dev): + device_mask = du.intel_device_info(sycl_dev).get("device_id", 0) & 0xFF00 + return device_mask in [0x3E00, 0x9B00] + + @pytest.mark.parametrize("arg_dtype", _all_dtypes[1:]) def test_prod_arg_dtype_default_output_dtype_matrix(arg_dtype): q = get_queue_or_skip() skip_if_dtype_not_supported(arg_dtype, q) + arg_dtype = dpt.dtype(arg_dtype) + if _any_complex((arg_dtype,)): + if _skip_on_this_device(q.sycl_device): + pytest.skip( + "Product reduction for complex output are known " + "to fail for Gen9 with 2024.0 compiler" + ) + m = dpt.ones(100, dtype=arg_dtype) r = dpt.prod(m) @@ -245,13 +262,12 @@ def test_prod_arg_out_dtype_matrix(arg_dtype, out_dtype): out_dtype = dpt.dtype(out_dtype) arg_dtype = dpt.dtype(arg_dtype) - if dpt.isdtype(out_dtype, "complex floating") and du._is_gen9( - q.sycl_device - ): - pytest.skip( - "Product reduction for complex output are known " - "to fail for Gen9 with 2024.0 compiler" - ) + if _any_complex((arg_dtype, out_dtype)): + if _skip_on_this_device(q.sycl_device): + pytest.skip( + "Product reduction for complex output are known 
" + "to fail for Gen9 with 2024.0 compiler" + ) m = dpt.ones(100, dtype=arg_dtype) r = dpt.prod(m, dtype=out_dtype) From a1b915a8cec93f7b023a2c92543ccdac044f1f67 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 11 Sep 2023 23:01:05 -0500 Subject: [PATCH 14/83] Swap use of deprecated IntelDPCPP with IntelSYCL cmake script Removed cmake/IntelDPCPP.cmake, vendored cmake/IntelSYCL.cmake script Changed project's CMake scripts to use IntelSYCL. Renamed libsyclinterface/cmake/modules/FindIntelSycl.cmake to libsyclinterface/cmake/modules/FindIntelSyclCompiler.cmake to avoid possible name conflict on OS with case insensitive FS --- CMakeLists.txt | 4 +- ...PCPPConfig.cmake => IntelSYCLConfig.cmake} | 142 ++++++++++++------ dpctl/CMakeLists.txt | 6 + dpctl/tensor/CMakeLists.txt | 7 +- libsyclinterface/CMakeLists.txt | 18 +-- ...Sycl.cmake => FindIntelSyclCompiler.cmake} | 90 +++++------ libsyclinterface/tests/CMakeLists.txt | 4 +- 7 files changed, 168 insertions(+), 103 deletions(-) rename cmake/{IntelDPCPPConfig.cmake => IntelSYCLConfig.cmake} (64%) mode change 100644 => 100755 rename libsyclinterface/cmake/modules/{FindIntelSycl.cmake => FindIntelSyclCompiler.cmake} (64%) diff --git a/CMakeLists.txt b/CMakeLists.txt index f506f1e2fe..a933e43b39 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,7 +18,7 @@ option(DPCTL_GENERATE_COVERAGE OFF ) -find_package(IntelDPCPP REQUIRED PATHS ${CMAKE_SOURCE_DIR}/cmake NO_DEFAULT_PATH) +find_package(IntelSYCL REQUIRED PATHS ${CMAKE_SOURCE_DIR}/cmake NO_DEFAULT_PATH) add_subdirectory(libsyclinterface) @@ -40,6 +40,8 @@ FetchContent_Declare( ) FetchContent_MakeAvailable(pybind11) +set(SKBUILD_LINK_LIBRARIES_KEYWORD PUBLIC) + add_subdirectory(dpctl) file(GLOB _cmake_scripts ${CMAKE_SOURCE_DIR}/cmake/*.cmake) diff --git a/cmake/IntelDPCPPConfig.cmake b/cmake/IntelSYCLConfig.cmake old mode 100644 new mode 100755 similarity index 64% rename from cmake/IntelDPCPPConfig.cmake rename to cmake/IntelSYCLConfig.cmake index 37d79a3ec1..5b5627bf46 --- a/cmake/IntelDPCPPConfig.cmake +++ b/cmake/IntelSYCLConfig.cmake @@ -1,5 +1,5 @@ # -# Modifications, Copyright (C) 2021 Intel Corporation +# Modifications, Copyright (C) 2022 Intel Corporation # # This software and the related documents are Intel copyrighted materials, and # your use of them is governed by the express license under which they were @@ -15,10 +15,10 @@ # file Copyright.txt or https://cmake.org/licensing for details. #[=======================================================================[.rst: -IntelDPCPPConfig +IntelSYCLConfig ------- -DPCPP Library to verify DPCPP/SYCL compatability of CMAKE_CXX_COMPILER +Library to verify SYCL compatability of CMAKE_CXX_COMPILER and passes relevant compiler flags. Result Variables @@ -26,8 +26,8 @@ Result Variables This will define the following variables: -``IntelDPCPP_FOUND`` - True if the system has the DPCPP library. +``IntelSYCL_FOUND`` + True if the system has the SYCL library. ``SYCL_LANGUAGE_VERSION`` The SYCL language spec version by Compiler. ``SYCL_INCLUDE_DIR`` @@ -37,35 +37,39 @@ This will define the following variables: ``SYCL_FLAGS`` SYCL specific flags for the compiler. +``IntelSYCL::SYCL_CXX`` + Target for using Intel SYCL (DPC++). 
The following properties are defined + for the target: ``INTERFACE_COMPILE_OPTIONS``, ``INTERFACE_LINK_OPTIONS``, + ``INTERFACE_INCLUDE_DIRECTORIES``, and ``INTERFACE_LINK_DIRECTORIES`` + Cache Variables ^^^^^^^^^^^^^^^ -The following cache variables may also be set: +The following cache variable may also be set: -``SYCL_INCLUDE_DIR`` - The directory containing ``sycl.hpp``. -``SYCL_LIBRARY_DIR`` - The path to the SYCL library. -``SYCL_FLAGS`` - SYCL specific flags for the compiler. ``SYCL_LANGUAGE_VERSION`` The SYCL language spec version by Compiler. -.. note:: +.. Note:: - For now, user needs to set -DCMAKE_CXX_COMPILER or environment of + 1. User needs to set -DCMAKE_CXX_COMPILER or environment of CXX pointing to SYCL compatible compiler ( eg: icx, clang++, icpx) - Note: do not set to DPCPP compiler. If set to a Compiler family - that supports dpcpp ( eg: IntelLLVM) both DPCPP and SYCL - features are enabled. - And add this package to user's Cmake config file. + 2. Add this package to user's Cmake config file. + + .. code-block:: cmake + + find_package(IntelSYCL REQUIRED) + + 3. Add sources to target through add_sycl_to_target() .. code-block:: cmake - find_package(IntelDPCPP REQUIRED) + # Compile specific sources for SYCL and build target for SYCL + add_executable(target_proj A.cpp B.cpp offload1.cpp offload2.cpp) + add_sycl_to_target(TARGET target_proj SOURCES offload1.cpp offload2.cpp) #]=======================================================================] @@ -83,25 +87,33 @@ endif() string(COMPARE EQUAL "${CMAKE_CXX_COMPILER}" "" nocmplr) if(nocmplr) - set(IntelDPCPP_FOUND False) + set(IntelSYCL_FOUND False) set(SYCL_REASON_FAILURE "SYCL: CMAKE_CXX_COMPILER not set!!") - set(IntelDPCPP_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}") + set(IntelSYCL_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}") +endif() + +# Check if a Compiler ID is being set. project() should be set prior to find_package() + +if("x${CMAKE_CXX_COMPILER_ID}" STREQUAL "x") + set(IntelSYCL_FOUND False) + set(SYCL_REASON_FAILURE "CMake CXX Compiler family is not set. Please make sure find_package(IntelSYCL) is called after project()!!") + set(IntelSYCL_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}") + return() endif() # Check for known compiler family that supports SYCL if( NOT "x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xClang" AND NOT "x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xIntelLLVM") - set(IntelDPCPP_FOUND False) + set(IntelSYCL_FOUND False) set(SYCL_REASON_FAILURE "Unsupported compiler family ${CMAKE_CXX_COMPILER_ID} and compiler ${CMAKE_CXX_COMPILER}!!") - set(IntelDPCPP_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}") + set(IntelSYCL_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}") return() endif() # Assume that CXX Compiler supports SYCL and then test to verify. set(SYCL_COMPILER ${CMAKE_CXX_COMPILER}) - # Function to write a test case to verify SYCL features. function(SYCL_FEATURE_TEST_WRITE src) @@ -144,7 +156,7 @@ function(SYCL_FEATURE_TEST_BUILD TEST_SRC_FILE TEST_EXE) OUTPUT_VARIABLE output ERROR_VARIABLE output OUTPUT_FILE ${SYCL_TEST_DIR}/Compile.log RESULT_VARIABLE result - TIMEOUT 20 + TIMEOUT 60 ) # Verify if test case build properly. @@ -168,12 +180,12 @@ function(SYCL_FEATURE_TEST_RUN TEST_EXE) WORKING_DIRECTORY ${SYCL_TEST_DIR} OUTPUT_VARIABLE output ERROR_VARIABLE output RESULT_VARIABLE result - TIMEOUT 20 + TIMEOUT 60 ) # Verify the test execution output. 
if(test_result) - set(IntelDPCPP_FOUND False) + set(IntelSYCL_FOUND False) set(SYCL_REASON_FAILURE "SYCL: feature test execution failed!!") endif() # TODO: what iff the result is false.. error or ignore? @@ -236,14 +248,14 @@ set(SYCL_LINK_FLAGS "") # Based on Compiler ID, add support for SYCL if( "x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xClang" OR "x${CMAKE_CXX_COMPILER_ID}" STREQUAL "xIntelLLVM") - set(SYCL_FLAGS "-fsycl ") - set(SYCL_LINK_FLAGS "-fsycl ") + list(APPEND SYCL_FLAGS "-fsycl") + list(APPEND SYCL_LINK_FLAGS "-fsycl") endif() # TODO verify if this is needed # Windows: Add Exception handling if(WIN32) - set(SYCL_FLAGS "${SYCL_FLAGS} /EHsc") + list(APPEND SYCL_FLAGS "/EHsc") endif() set(SYCL_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SYCL_FLAGS}") @@ -273,32 +285,76 @@ SYCL_FEATURE_TEST_EXTRACT(${test_output}) # define macro SYCL_LANGUAGE_VERSION string(COMPARE EQUAL "${SYCL_LANGUAGE_VERSION}" "" nosycllang) if(nosycllang) - set(IntelDPCPP_FOUND False) + set(IntelSYCL_FOUND False) set(SYCL_REASON_FAILURE "SYCL: It appears that the ${CMAKE_CXX_COMPILER} does not support SYCL") - set(IntelDPCPP_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}") + set(IntelSYCL_NOT_FOUND_MESSAGE "${SYCL_REASON_FAILURE}") endif() # Placeholder for identifying various implemenations of SYCL compilers. # for now, set to the CMAKE_CXX_COMPILER_ID set(SYCL_IMPLEMENTATION_ID "${CMAKE_CXX_COMPILER_ID}") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SYCL_FLAGS}") -set(CMAKE_CXX_LINK_FLAGS "${CMAKE_CXX_LINK_FLAGS} ${SYCL_LINK_FLAGS}") +message(DEBUG "The SYCL compiler is ${SYCL_COMPILER}") +message(DEBUG "The SYCL Flags are ${SYCL_FLAGS}") +message(DEBUG "The SYCL Language Version is ${SYCL_LANGUAGE_VERSION}") -message(STATUS "Echo from ${CMAKE_CURRENT_SOURCE_DIR}/IntelDPCPPConfig.cmake") -message(STATUS "The SYCL compiler is ${SYCL_COMPILER}") -message(STATUS "The SYCL Flags are ${SYCL_FLAGS}") -message(STATUS "The SYCL Language Version is ${SYCL_LANGUAGE_VERSION}") +add_library(IntelSYCL::SYCL_CXX INTERFACE IMPORTED) +set_property(TARGET IntelSYCL::SYCL_CXX PROPERTY + INTERFACE_COMPILE_OPTIONS ${SYCL_FLAGS}) +set_property(TARGET IntelSYCL::SYCL_CXX PROPERTY + INTERFACE_LINK_OPTIONS ${SYCL_LINK_FLAGS}) +set_property(TARGET IntelSYCL::SYCL_CXX PROPERTY + INTERFACE_INCLUDE_DIRECTORIES ${SYCL_INCLUDE_DIR}) +set_property(TARGET IntelSYCL::SYCL_CXX PROPERTY + INTERFACE_LINK_DIRECTORIES ${SYCL_LIBRARY_DIR}) find_package_handle_standard_args( - IntelDPCPP - FOUND_VAR IntelDPCPP_FOUND + IntelSYCL + FOUND_VAR IntelSYCL_FOUND REQUIRED_VARS SYCL_INCLUDE_DIR SYCL_LIBRARY_DIR SYCL_FLAGS VERSION_VAR SYCL_LANGUAGE_VERSION REASON_FAILURE_MESSAGE "${SYCL_REASON_FAILURE}") # Include in Cache set(SYCL_LANGUAGE_VERSION "${SYCL_LANGUAGE_VERSION}" CACHE STRING "SYCL Language version") -set(SYCL_INCLUDE_DIR "${SYCL_INCLUDE_DIR}" CACHE FILEPATH "SYCL Include directory") -set(SYCL_LIBRARY_DIR "${SYCL_LIBRARY_DIR}" CACHE FILEPATH "SYCL Library Directory") -set(SYCL_FLAGS "${SYCL_FLAGS}" CACHE STRING "SYCL flags for the compiler") + +function(add_sycl_to_target) + + set(one_value_args TARGET) + set(multi_value_args SOURCES) + cmake_parse_arguments(SYCL + "" + "${one_value_args}" + "${multi_value_args}" + ${ARGN}) + + + get_target_property(__sycl_cxx_options IntelSYCL::SYCL_CXX INTERFACE_COMPILE_OPTIONS) + get_target_property(__sycl_cxx_include_directories IntelSYCL::SYCL_CXX INTERFACE_INCLUDE_DIRECTORIES) + + if(NOT ${ARGC}) + message(FATAL_ERROR " add_sycl_to_target() does not have any arguments") + elseif(${ARGC} EQUAL 1) + message(WARNING 
"add_sycl_to_target() have only one argument specified.. assuming the target to be ${ARGV}. +Adding sycl to all sources but that may effect compilation times") + set(SYCL_TARGET ${ARGV}) + endif() + + if(NOT SYCL_SOURCES) + message(WARNING "add_sycl_to_target() does not have sources specified.. Adding sycl to all sources but that may effect compilation times") + target_compile_options(${SYCL_TARGET} PUBLIC ${__sycl_cxx_options}) + target_include_directories(${SYCL_TARGET} PUBLIC ${__sycl_cxx_include_directories}) + endif() + + foreach(source ${SYCL_SOURCES}) + set_source_files_properties(${source} PROPERTIES COMPILE_OPTIONS "${__sycl_cxx_options}") + set_source_files_properties(${source} PROPERTIES INCLUDE_DIRECTORIES "${__sycl_cxx_include_directories}") + endforeach() + + get_target_property(__sycl_link_options + IntelSYCL::SYCL_CXX INTERFACE_LINK_OPTIONS) + target_link_options(${SYCL_TARGET} PUBLIC "${__sycl_link_options}") + get_target_property(__sycl_link_directories + IntelSYCL::SYCL_CXX INTERFACE_LINK_DIRECTORIES) + target_link_directories(${SYCL_TARGET} PUBLIC "${__sycl_link_directories}") +endfunction(add_sycl_to_target) diff --git a/dpctl/CMakeLists.txt b/dpctl/CMakeLists.txt index 795ca09c78..f0abed2079 100644 --- a/dpctl/CMakeLists.txt +++ b/dpctl/CMakeLists.txt @@ -137,10 +137,15 @@ add_custom_target( set(CMAKE_INSTALL_RPATH "$ORIGIN") function(build_dpctl_ext _trgt _src _dest) + set(options SYCL) + cmake_parse_arguments(BUILD_DPCTL_EXT "${options}" "" "" ${ARGN}) add_cython_target(${_trgt} ${_src} CXX OUTPUT_VAR _generated_src) set(_cythonize_trgt "${_trgt}_cythonize_pyx") add_custom_target(${_cythonize_trgt} DEPENDS ${_src}) python_add_library(${_trgt} MODULE ${_generated_src}) + if (BUILD_DPCTL_EXT_SYCL) + add_sycl_to_target(TARGET ${_trgt} SOURCES ${_generated_src}) + endif() target_include_directories(${_trgt} PRIVATE ${NumPy_INCLUDE_DIR} ${DPCTL_INCLUDE_DIR}) add_dependencies(${_trgt} _build_time_create_dpctl_include_copy ${_cythonize_trgt}) if (DPCTL_GENERATE_COVERAGE) @@ -193,6 +198,7 @@ endforeach() # _sycl_queue include _host_task_util.hpp target_include_directories(_sycl_queue PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) +add_sycl_to_target(TARGET _sycl_queue SOURCES _sycl_queue.cxx) add_subdirectory(program) add_subdirectory(memory) diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 9a2493421e..c752abace1 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -30,8 +30,7 @@ if(WIN32) endif() endif() -set(python_module_name _tensor_impl) -pybind11_add_module(${python_module_name} MODULE +set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_py.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp @@ -52,6 +51,10 @@ pybind11_add_module(${python_module_name} MODULE ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reduction_over_axis.cpp ) + +set(python_module_name _tensor_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_impl_sources}) set(_clang_prefix "") if (WIN32) set(_clang_prefix "/clang:") diff --git a/libsyclinterface/CMakeLists.txt b/libsyclinterface/CMakeLists.txt index 01b0321064..64ec3271b1 100644 --- a/libsyclinterface/CMakeLists.txt +++ b/libsyclinterface/CMakeLists.txt @@ -11,8 +11,8 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} 
"${CMAKE_CURRENT_SOURCE_DIR}/cmake/mo find_package(Git REQUIRED) -if(NOT DEFINED IntelDPCPP_FOUND OR NOT IntelDPCPP_FOUND) - find_package(IntelDPCPP REQUIRED) +if(NOT DEFINED IntelSYCL_FOUND OR NOT IntelSYCL_FOUND) + find_package(IntelSYCL REQUIRED) endif() # Option to turn on support for creating Level Zero interoperability programs @@ -43,11 +43,10 @@ option(DPCTL_ENABLE_GLOG ) # Minimum version requirement only when oneAPI dpcpp is used. -find_package(IntelDPCPP REQUIRED) if(DPCTL_DPCPP_FROM_ONEAPI) - find_package(IntelSycl 2021.3.0 REQUIRED) + find_package(IntelSyclCompiler 2021.3.0 REQUIRED) else() - find_package(IntelSycl REQUIRED) + find_package(IntelSyclCompiler REQUIRED) endif() if(DPCTL_ENABLE_L0_PROGRAM_CREATION) @@ -57,7 +56,7 @@ if(DPCTL_ENABLE_L0_PROGRAM_CREATION) if (UNIX) find_library(PI_LEVEL_ZERO_LIB NAMES pi_level_zero - HINTS ${IntelSycl_LIBRARY_DIR} + HINTS ${IntelSyclCompiler_LIBRARY_DIR} ) find_program(READELF_PROG readelf) find_program(GREP_PROG grep) @@ -77,7 +76,7 @@ endif() if (UNIX) find_library(PI_OPENCL_LIB NAMES pi_opencl - HINTS ${IntelSycl_LIBRARY_DIR} + HINTS ${IntelSyclCompiler_LIBRARY_DIR} ) find_program(READELF_PROG readelf) find_program(GREP_PROG grep) @@ -157,7 +156,6 @@ elseif(UNIX) string(CONCAT CXXFLAGS "${WARNING_FLAGS}" "${SDL_FLAGS}" - "-fsycl " ) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CFLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXXFLAGS}") @@ -206,6 +204,7 @@ add_library(DPCTLSyclInterface ${sources} ${helper_sources} ) +add_sycl_to_target(TARGET DPCTLSyclInterface SOURCES ${sources} ${helper_sources}) if(DPCTL_GENERATE_COVERAGE) target_link_options(DPCTLSyclInterface @@ -222,9 +221,6 @@ target_include_directories(DPCTLSyclInterface ${CMAKE_CURRENT_SOURCE_DIR}/helper/include/ ${SYCL_INCLUDE_DIR} ) -target_link_libraries(DPCTLSyclInterface - PRIVATE ${IntelSycl_SYCL_LIBRARY} -) if(DPCTL_ENABLE_GLOG) find_package(glog REQUIRED) diff --git a/libsyclinterface/cmake/modules/FindIntelSycl.cmake b/libsyclinterface/cmake/modules/FindIntelSyclCompiler.cmake similarity index 64% rename from libsyclinterface/cmake/modules/FindIntelSycl.cmake rename to libsyclinterface/cmake/modules/FindIntelSyclCompiler.cmake index 84e8946fea..45bb4f583f 100644 --- a/libsyclinterface/cmake/modules/FindIntelSycl.cmake +++ b/libsyclinterface/cmake/modules/FindIntelSyclCompiler.cmake @@ -19,21 +19,23 @@ # # Example usage: # -# find_package(IntelSycl) +# find_package(IntelSyclCompiler) # # If successful, the following variables will be defined: -# IntelSycl_FOUND -# IntelSycl_VERSION -# IntelSycl_INCLUDE_DIR -# IntelSycl_C_COMPILER -# IntelSycl_CXX_COMPILER -# IntelSycl_SYCL_INCLUDE_DIR -# IntelSycl_LIBRARY_DIR -# IntelSycl_SYCL_LIBRARY -# IntelSycl_OPENCL_LIBRARY +# IntelSyclCompiler_FOUND +# IntelSyclCompiler_VERSION +# IntelSyclCompiler_INCLUDE_DIR +# IntelSyclCompiler_C_COMPILER +# IntelSyclCompiler_CXX_COMPILER +# IntelSyclCompiler_SYCL_INCLUDE_DIR +# IntelSyclCompiler_LIBRARY_DIR +# IntelSyclCompiler_SYCL_LIBRARY +# IntelSyclCompiler_OPENCL_LIBRARY include(FindPackageHandleStandardArgs) -find_package(IntelDPCPP REQUIRED) +if(NOT DEFINED IntelSYCL_FOUND OR NOT IntelSYCL_FOUND) + find_package(IntelSYCL REQUIRED) +endif() # We will extract the version information from the compiler set(clangxx_cmd "${CMAKE_CXX_COMPILER}") @@ -91,78 +93,78 @@ execute_process( # If dpcpp is found then set the package variables if(${clangxx_result} MATCHES "0") - string(REPLACE "\n" ";" IntelSycl_VERSION_LIST "${clangxx_ver}") + string(REPLACE "\n" ";" IntelSyclCompiler_VERSION_LIST 
"${clangxx_ver}") set(IDX 0) - foreach(X ${IntelSycl_VERSION_LIST}) + foreach(X ${IntelSyclCompiler_VERSION_LIST}) message(STATUS "dpcpp ver[${IDX}]: ${X}") MATH(EXPR IDX "${IDX}+1") endforeach() - list(GET IntelSycl_VERSION_LIST 0 VERSION_STRING) + list(GET IntelSyclCompiler_VERSION_LIST 0 VERSION_STRING) # Get the dpcpp version string(REGEX MATCH "[0-9]+\.[0-9]+\.[0-9]+" - IntelSycl_VERSION + IntelSyclCompiler_VERSION ${VERSION_STRING} ) # Split out the version into major, minor an patch - string(REPLACE "." ";" IntelSycl_VERSION_LIST1 "${IntelSycl_VERSION}") - list(GET IntelSycl_VERSION_LIST1 0 IntelSycl_VERSION_MAJOR) - list(GET IntelSycl_VERSION_LIST1 1 IntelSycl_VERSION_MINOR) - list(GET IntelSycl_VERSION_LIST1 2 IntelSycl_VERSION_PATCH) - set(IntelSycl_INCLUDE_DIR ${SYCL_INCLUDE_DIR}) - set(IntelSycl_SYCL_INCLUDE_DIR ${SYCL_INCLUDE_DIR}/sycl) - set(IntelSycl_LIBRARY_DIR ${SYCL_LIBRARY_DIR}) + string(REPLACE "." ";" IntelSyclCompiler_VERSION_LIST1 "${IntelSyclCompiler_VERSION}") + list(GET IntelSyclCompiler_VERSION_LIST1 0 IntelSyclCompiler_VERSION_MAJOR) + list(GET IntelSyclCompiler_VERSION_LIST1 1 IntelSyclCompiler_VERSION_MINOR) + list(GET IntelSyclCompiler_VERSION_LIST1 2 IntelSyclCompiler_VERSION_PATCH) + set(IntelSyclCompiler_INCLUDE_DIR ${SYCL_INCLUDE_DIR}) + set(IntelSyclCompiler_SYCL_INCLUDE_DIR ${SYCL_INCLUDE_DIR}/sycl) + set(IntelSyclCompiler_LIBRARY_DIR ${SYCL_LIBRARY_DIR}) if("x${CMAKE_SYSTEM_NAME}" STREQUAL "xWindows") find_file( - IntelSycl_SYCL_LIBRARY + IntelSyclCompiler_SYCL_LIBRARY NAMES "sycl.lib" "sycl6.lib" "sycl7.lib" - PATHS ${IntelSycl_LIBRARY_DIR} + PATHS ${IntelSyclCompiler_LIBRARY_DIR} ) find_file( - IntelSycl_OPENCL_LIBRARY + IntelSyclCompiler_OPENCL_LIBRARY NAMES "OpenCL.lib" - PATHS ${IntelSycl_LIBRARY_DIR} + PATHS ${IntelSyclCompiler_LIBRARY_DIR} ) elseif("x${CMAKE_SYSTEM_NAME}" STREQUAL "xLinux") find_file( - IntelSycl_SYCL_LIBRARY + IntelSyclCompiler_SYCL_LIBRARY NAMES "libsycl.so" - PATHS ${IntelSycl_LIBRARY_DIR} + PATHS ${IntelSyclCompiler_LIBRARY_DIR} ) find_file( - IntelSycl_OPENCL_LIBRARY + IntelSyclCompiler_OPENCL_LIBRARY NAMES "libOpenCL.so" - PATHS ${IntelSycl_LIBRARY_DIR} + PATHS ${IntelSyclCompiler_LIBRARY_DIR} ) endif() endif() # Check if a specific version of DPCPP is requested. 
-if(IntelSycl_FIND_VERSION AND (DEFINED IntelSycl_VERSION)) +if(IntelSyclCompiler_FIND_VERSION AND (DEFINED IntelSyclCompiler_VERSION)) set(VERSION_GT_FIND_VERSION FALSE) versions_greater_equal( - ${IntelSycl_VERSION} - ${IntelSycl_FIND_VERSION} + ${IntelSyclCompiler_VERSION} + ${IntelSyclCompiler_FIND_VERSION} VERSION_GT_FIND_VERSION ) if(VERSION_GT_FIND_VERSION) - set(IntelSycl_FOUND TRUE) + set(IntelSyclCompiler_FOUND TRUE) else() - set(IntelSycl_FOUND FALSE) + set(IntelSyclCompiler_FOUND FALSE) endif() else() - set(IntelSycl_FOUND TRUE) + set(IntelSyclCompiler_FOUND TRUE) endif() -find_package_handle_standard_args(IntelSycl DEFAULT_MSG - IntelSycl_FOUND - IntelSycl_VERSION - IntelSycl_INCLUDE_DIR - IntelSycl_SYCL_INCLUDE_DIR - IntelSycl_LIBRARY_DIR - IntelSycl_SYCL_LIBRARY - IntelSycl_OPENCL_LIBRARY +find_package_handle_standard_args(IntelSyclCompiler DEFAULT_MSG + IntelSyclCompiler_FOUND + IntelSyclCompiler_VERSION + IntelSyclCompiler_INCLUDE_DIR + IntelSyclCompiler_SYCL_INCLUDE_DIR + IntelSyclCompiler_LIBRARY_DIR + IntelSyclCompiler_SYCL_LIBRARY + IntelSyclCompiler_OPENCL_LIBRARY ) diff --git a/libsyclinterface/tests/CMakeLists.txt b/libsyclinterface/tests/CMakeLists.txt index 4cfd30338d..7d324ed81a 100644 --- a/libsyclinterface/tests/CMakeLists.txt +++ b/libsyclinterface/tests/CMakeLists.txt @@ -40,7 +40,7 @@ if(DPCTL_GENERATE_COVERAGE) ${CMAKE_THREAD_LIBS_INIT} GTest::GTest DPCTLSyclInterface - ${IntelSycl_OPENCL_LIBRARY} + ${IntelSyclCompiler_OPENCL_LIBRARY} ${CMAKE_DL_LIBS} ) set(object_arg "-object;") @@ -96,7 +96,7 @@ else() ${CMAKE_THREAD_LIBS_INIT} GTest::GTest DPCTLSyclInterface - ${IntelSycl_OPENCL_LIBRARY} + ${IntelSyclCompiler_OPENCL_LIBRARY} ) endif() From 5d5dcc2ee59dd33cbba1ffc92bb5bf35763a4643 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 11 Sep 2023 23:03:06 -0500 Subject: [PATCH 15/83] Use IntelSYCL instead of IntelDPCPP cmake script --- examples/pybind11/external_usm_allocation/CMakeLists.txt | 8 ++++++-- examples/pybind11/onemkl_gemv/CMakeLists.txt | 6 ++++-- examples/pybind11/use_dpctl_sycl_kernel/CMakeLists.txt | 6 ++++-- examples/pybind11/use_dpctl_sycl_queue/CMakeLists.txt | 6 ++++-- 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/examples/pybind11/external_usm_allocation/CMakeLists.txt b/examples/pybind11/external_usm_allocation/CMakeLists.txt index db8c6c9aa6..43f2c05a78 100644 --- a/examples/pybind11/external_usm_allocation/CMakeLists.txt +++ b/examples/pybind11/external_usm_allocation/CMakeLists.txt @@ -5,7 +5,7 @@ project(external_usm_allocation VERSION 0.1 LANGUAGES CXX set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake") set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH}) -find_package(IntelDPCPP REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH) +find_package(IntelSYCK REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED True) @@ -23,10 +23,14 @@ find_package(Python REQUIRED COMPONENTS Development.Module NumPy) find_package(Dpctl REQUIRED) set(py_module_name _external_usm_alloc) +set(_sources + external_usm_allocation/_usm_alloc_example.cpp +) pybind11_add_module(${py_module_name} MODULE - external_usm_allocation/_usm_alloc_example.cpp + ${_sources} ) +add_sycl_to_target(TARGET ${py_module_name} SOURCES ${_sources}) target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS}) install(TARGETS ${py_module_name} DESTINATION external_usm_allocation diff --git a/examples/pybind11/onemkl_gemv/CMakeLists.txt 
b/examples/pybind11/onemkl_gemv/CMakeLists.txt index c2ac5fc516..67e55c895c 100644 --- a/examples/pybind11/onemkl_gemv/CMakeLists.txt +++ b/examples/pybind11/onemkl_gemv/CMakeLists.txt @@ -4,7 +4,7 @@ project(example_use_mkl_gemm VERSION 0.1 LANGUAGES CXX DESCRIPTION "Example of using Python wrapper to oneMKL function") set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake") set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH}) -find_package(IntelDPCPP REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH) +find_package(IntelSYCL REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}") @@ -35,10 +35,12 @@ find_library(OpenCL NAMES OpenCL REQUIRED) set(py_module_name _onemkl) +set(_sources sycl_gemm/_onemkl.cpp) pybind11_add_module(${py_module_name} MODULE - sycl_gemm/_onemkl.cpp + ${_sources} ) +add_sycl_to_target(TARGET ${py_module_name} SOURCES ${_sources}) target_compile_definitions(${py_module_name} PRIVATE -DMKL_ILP64) target_include_directories(${py_module_name} PUBLIC ${MKL_INCLUDE_DIR} sycl_gemm diff --git a/examples/pybind11/use_dpctl_sycl_kernel/CMakeLists.txt b/examples/pybind11/use_dpctl_sycl_kernel/CMakeLists.txt index 32770aa750..ec33b2e153 100644 --- a/examples/pybind11/use_dpctl_sycl_kernel/CMakeLists.txt +++ b/examples/pybind11/use_dpctl_sycl_kernel/CMakeLists.txt @@ -5,7 +5,7 @@ project(use_queue_device VERSION 0.1 LANGUAGES CXX set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake") set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH}) -find_package(IntelDPCPP REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH) +find_package(IntelSYCL REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED True) @@ -24,10 +24,12 @@ find_package(Python REQUIRED COMPONENTS Development.Module NumPy) find_package(Dpctl REQUIRED) set(py_module_name _use_kernel) +set(_sources use_kernel/_example.cpp) pybind11_add_module(${py_module_name} MODULE - use_kernel/_example.cpp + ${_sources} ) +add_sycl_to_target(TARGET ${py_module_name} SOURCES ${_sources}) target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS}) install(TARGETS ${py_module_name} DESTINATION use_kernel diff --git a/examples/pybind11/use_dpctl_sycl_queue/CMakeLists.txt b/examples/pybind11/use_dpctl_sycl_queue/CMakeLists.txt index 4ee47e71a9..827388fae1 100644 --- a/examples/pybind11/use_dpctl_sycl_queue/CMakeLists.txt +++ b/examples/pybind11/use_dpctl_sycl_queue/CMakeLists.txt @@ -5,7 +5,7 @@ project(use_queue_device VERSION 0.1 LANGUAGES CXX set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake") set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH}) -find_package(IntelDPCPP REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH) +find_package(IntelSYCL REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED True) @@ -23,10 +23,12 @@ find_package(Python REQUIRED COMPONENTS Development.Module NumPy) find_package(Dpctl REQUIRED) set(py_module_name _use_queue_device) +set(_sources use_queue_device/_example.cpp) pybind11_add_module(${py_module_name} MODULE - use_queue_device/_example.cpp + ${_sources} ) +add_sycl_to_target(TARGET ${py_module_name} SOURCES ${_sources}) target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS}) install(TARGETS ${py_module_name} DESTINATION use_queue_device From 
de644b928f457bbbca560167a62dc71a11ceb4ce Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 12 Sep 2023 00:44:33 -0500 Subject: [PATCH 16/83] libsyclinterface/tests/test_helper.cpp needs SYCL --- libsyclinterface/tests/CMakeLists.txt | 43 +++++++++++++++++++-------- 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/libsyclinterface/tests/CMakeLists.txt b/libsyclinterface/tests/CMakeLists.txt index 7d324ed81a..8fcc5a20d0 100644 --- a/libsyclinterface/tests/CMakeLists.txt +++ b/libsyclinterface/tests/CMakeLists.txt @@ -21,17 +21,38 @@ foreach(tf ${spirv-test-files}) file(COPY ${tf} DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) endforeach() -if(DPCTL_GENERATE_COVERAGE) - file(GLOB_RECURSE - sources ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp - ) +file(GLOB_RECURSE + sources ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp +) - # Add all dpctl sources into a single executable so that we can run coverage - # analysis and generate a report. - add_executable(dpctl_c_api_tests - EXCLUDE_FROM_ALL - ${sources} - ) +# Add all dpctl sources into a single executable so that we can run coverage +# analysis and generate a report. +add_executable(dpctl_c_api_tests + EXCLUDE_FROM_ALL + ${sources} +) +add_sycl_to_target( + TARGET dpctl_c_api_tests + SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/test_helper.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_queue_manager.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_queue_submit.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_context_interface.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_usm_interface.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_queue_interface.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_platform_invalid_filters.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_device_aspects.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_event_interface.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_kernel_interface.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_kernel_bundle_interface.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_device_invalid_filters.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_device_subdevices.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_device_interface.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_device_selector_interface.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_platform_interface.cpp +) + +if(DPCTL_GENERATE_COVERAGE) target_include_directories(dpctl_c_api_tests PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../helper/include" PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../include" @@ -90,8 +111,6 @@ if(DPCTL_GENERATE_COVERAGE) DEPENDS dpctl_c_api_tests ) else() - file(GLOB_RECURSE sources ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) - add_executable(dpctl_c_api_tests EXCLUDE_FROM_ALL ${sources}) target_link_libraries(dpctl_c_api_tests ${CMAKE_THREAD_LIBS_INIT} GTest::GTest From ce5e7c822f96e52e70723570ce37a4964388b8d4 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 12 Sep 2023 03:56:55 -0500 Subject: [PATCH 17/83] Make sure to not overwrite -fsycl option in select files Removed -fsycl for default linking options on Windows. The option is added as needed. 
--- dpctl/CMakeLists.txt | 10 ++++++---- dpctl/tensor/CMakeLists.txt | 14 +++++++++++--- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/dpctl/CMakeLists.txt b/dpctl/CMakeLists.txt index f0abed2079..7e2a25280a 100644 --- a/dpctl/CMakeLists.txt +++ b/dpctl/CMakeLists.txt @@ -58,7 +58,6 @@ elseif(UNIX) string(CONCAT CXXFLAGS "${WARNING_FLAGS}" "${SDL_FLAGS}" - "-fsycl " ) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 ${CFLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 ${CXXFLAGS}") @@ -190,15 +189,18 @@ function(build_dpctl_ext _trgt _src _dest) install(TARGETS ${_trgt} LIBRARY DESTINATION ${_dest}) endfunction() -file(GLOB _cython_sources *.pyx) +file(GLOB _cython_sources ${CMAKE_CURRENT_SOURCE_DIR}/*.pyx) +list(REMOVE_ITEM _cython_sources ${CMAKE_CURRENT_SOURCE_DIR}/_sycl_queue.pyx) foreach(_cy_file ${_cython_sources}) get_filename_component(_trgt ${_cy_file} NAME_WLE) build_dpctl_ext(${_trgt} ${_cy_file} "dpctl") endforeach() +set(_cy_file ${CMAKE_CURRENT_SOURCE_DIR}/_sycl_queue.pyx) +get_filename_component(_trgt ${_cy_file} NAME_WLE) +build_dpctl_ext(${_trgt} ${_cy_file} "dpctl" SYCL) # _sycl_queue include _host_task_util.hpp -target_include_directories(_sycl_queue PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) -add_sycl_to_target(TARGET _sycl_queue SOURCES _sycl_queue.cxx) +target_include_directories(${_trgt} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) add_subdirectory(program) add_subdirectory(memory) diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index c752abace1..f9cdc83e92 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -59,12 +59,20 @@ set(_clang_prefix "") if (WIN32) set(_clang_prefix "/clang:") endif() -set_source_files_properties( + +set(_no_fast_math_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reduction_over_axis.cpp - PROPERTIES COMPILE_OPTIONS "${_clang_prefix}-fno-fast-math") +) +foreach(_src_fn ${_no_fast_math_sources}) + get_source_file_property(_cmpl_options_prop ${_src_fn} COMPILE_OPTIONS) + set(_combined_options_prop ${_cmpl_options_prop} "${_clang_prefix}-fno-fast-math") + set_source_files_properties( + ${_src_fn} + PROPERTIES COMPILE_OPTIONS "${_combined_options_prop}" + ) +endforeach() if (UNIX) set_source_files_properties( ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp From 5898fdd4b388576dd1b7266c129359acdbf759c0 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 12 Sep 2023 05:13:03 -0500 Subject: [PATCH 18/83] IntelSycl_VERSION -> IntelSyclCompiler_VERSION --- libsyclinterface/include/Config/dpctl_config.h.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libsyclinterface/include/Config/dpctl_config.h.in b/libsyclinterface/include/Config/dpctl_config.h.in index f26fc5591b..6e3daffbed 100644 --- a/libsyclinterface/include/Config/dpctl_config.h.in +++ b/libsyclinterface/include/Config/dpctl_config.h.in @@ -31,7 +31,7 @@ #define __SYCL_COMPILER_VERSION_REQUIRED 20221201L /* The DPCPP version used to build dpctl */ -#define DPCTL_DPCPP_VERSION "@IntelSycl_VERSION@" +#define DPCTL_DPCPP_VERSION "@IntelSyclCompiler_VERSION@" #define DPCTL_LIBZE_LOADER_FILENAME "@LIBZE_LOADER_FILENAME@" #define DPCTL_LIBCL_LOADER_FILENAME "@LIBCL_LOADER_FILENAME@" From 72e44ba09a271360fef8e3537ee1eaa815246b90 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 12 Sep 2023 
05:23:36 -0500
Subject: [PATCH 19/83] Added an additional test file that needs SYCL

---
 libsyclinterface/tests/CMakeLists.txt | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/libsyclinterface/tests/CMakeLists.txt b/libsyclinterface/tests/CMakeLists.txt
index 8fcc5a20d0..472e1787fa 100644
--- a/libsyclinterface/tests/CMakeLists.txt
+++ b/libsyclinterface/tests/CMakeLists.txt
@@ -35,21 +35,22 @@ add_sycl_to_target(
   TARGET dpctl_c_api_tests
   SOURCES
   ${CMAKE_CURRENT_SOURCE_DIR}/test_helper.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_queue_manager.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_queue_submit.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_context_interface.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_usm_interface.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_queue_interface.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_platform_invalid_filters.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_device_aspects.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_event_interface.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_kernel_interface.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_kernel_bundle_interface.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_device_invalid_filters.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_device_subdevices.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_device_interface.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_device_manager.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_device_selector_interface.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_device_aspects.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_event_interface.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_platform_interface.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_kernel_interface.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_kernel_bundle_interface.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_platform_invalid_filters.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_queue_manager.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_queue_submit.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_queue_interface.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_usm_interface.cpp
 )
 
 if(DPCTL_GENERATE_COVERAGE)

From 679fdc971c98901a0849350eb776387681f85209 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Tue, 12 Sep 2023 07:34:31 -0500
Subject: [PATCH 20/83] Try to make link options added by add_sycl_to_target
 PRIVATE

---
 cmake/IntelSYCLConfig.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/IntelSYCLConfig.cmake b/cmake/IntelSYCLConfig.cmake
index 5b5627bf46..c51e47290c 100755
--- a/cmake/IntelSYCLConfig.cmake
+++ b/cmake/IntelSYCLConfig.cmake
@@ -353,7 +353,7 @@ Adding sycl to all sources but that may effect compilation times")
     get_target_property(__sycl_link_options
       IntelSYCL::SYCL_CXX INTERFACE_LINK_OPTIONS)
-    target_link_options(${SYCL_TARGET} PUBLIC "${__sycl_link_options}")
+    target_link_options(${SYCL_TARGET} PRIVATE "${__sycl_link_options}")
     get_target_property(__sycl_link_directories
       IntelSYCL::SYCL_CXX INTERFACE_LINK_DIRECTORIES)
     target_link_directories(${SYCL_TARGET} PUBLIC "${__sycl_link_directories}")

From c6a930508b533ff44db5512bc7eec8fdf4ba1917 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Wed, 13 Sep 2023 06:55:56 -0500
Subject: [PATCH 21/83] Do not override -fsycl

---
 examples/pybind11/onemkl_gemv/CMakeLists.txt | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/examples/pybind11/onemkl_gemv/CMakeLists.txt b/examples/pybind11/onemkl_gemv/CMakeLists.txt
index 67e55c895c..eb70b22982 100644
--- a/examples/pybind11/onemkl_gemv/CMakeLists.txt
+++ b/examples/pybind11/onemkl_gemv/CMakeLists.txt
@@ -52,11 +52,14 @@ target_link_libraries(${py_module_name}
 install(TARGETS ${py_module_name} DESTINATION sycl_gemm)
 target_include_directories(${py_module_name} PUBLIC ${Dpctl_INCLUDE_DIRS})
 
-get_target_property(_sycl_gemm_sources ${py_module_name} SOURCES)
-set_source_files_properties(${_sycl_gemm_sources}
-    PROPERTIES
-    COMPILE_OPTIONS "-O3"
-)
+foreach(_src_fn ${_sources})
+    get_source_file_property(_compile_options ${_src_fn} COMPILE_OPTIONS)
+    set(_combined_options ${_compile_options} "-O3")
+    set_source_files_properties(${_src_fn}
+        PROPERTIES
+        COMPILE_OPTIONS "${_combined_options}"
+    )
+endforeach()
 target_link_options(${py_module_name} PRIVATE -fsycl-device-code-split=per_kernel)
 
 add_executable(standalone_cpp

From 5adfe0ea72f005f2f51a10a5c2a09122b4174b29 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Wed, 13 Sep 2023 09:19:59 -0500
Subject: [PATCH 22/83] Fixed typo in
 examples/pybind11/external_usm_allocation/CMakeLists.txt

---
 examples/pybind11/external_usm_allocation/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pybind11/external_usm_allocation/CMakeLists.txt b/examples/pybind11/external_usm_allocation/CMakeLists.txt
index 43f2c05a78..c8679ab73a 100644
--- a/examples/pybind11/external_usm_allocation/CMakeLists.txt
+++ b/examples/pybind11/external_usm_allocation/CMakeLists.txt
@@ -5,7 +5,7 @@ project(external_usm_allocation VERSION 0.1 LANGUAGES CXX
 set(DPCTL_CMAKE_MODULES_PATH "${CMAKE_SOURCE_DIR}/../../../cmake")
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${DPCTL_CMAKE_MODULES_PATH})
-find_package(IntelSYCK REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH)
+find_package(IntelSYCL REQUIRED PATHS ${DPCTL_CMAKE_MODULES_PATH} NO_DEFAULT_PATH)
 
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED True)

From b60bc90947d9795dfcb9214e8697b165b09110d9 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Sat, 23 Sep 2023 22:14:34 -0500
Subject: [PATCH 23/83] Removed setting SKBUILD_LINK_LIBRARIES_KEYWORD since
 we moved to CMake's python_add_library

---
 CMakeLists.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a933e43b39..eb53db12ec 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -40,8 +40,6 @@ FetchContent_Declare(
 )
 FetchContent_MakeAvailable(pybind11)
 
-set(SKBUILD_LINK_LIBRARIES_KEYWORD PUBLIC)
-
 add_subdirectory(dpctl)
 
 file(GLOB _cmake_scripts ${CMAKE_SOURCE_DIR}/cmake/*.cmake)

From ad1f120aba1fac3de7c60fd8b8e2e07ba169c2e2 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Fri, 6 Oct 2023 11:02:16 -0500
Subject: [PATCH 24/83] Use python_add_library with WITH_SOABI

---
 dpctl/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dpctl/CMakeLists.txt b/dpctl/CMakeLists.txt
index 7e2a25280a..cb872ff45f 100644
--- a/dpctl/CMakeLists.txt
+++ b/dpctl/CMakeLists.txt
@@ -141,7 +141,7 @@ function(build_dpctl_ext _trgt _src _dest)
     add_cython_target(${_trgt} ${_src} CXX OUTPUT_VAR _generated_src)
     set(_cythonize_trgt "${_trgt}_cythonize_pyx")
     add_custom_target(${_cythonize_trgt} DEPENDS ${_src})
-    python_add_library(${_trgt} MODULE ${_generated_src})
+    Python_add_library(${_trgt} MODULE WITH_SOABI ${_generated_src})
     if (BUILD_DPCTL_EXT_SYCL)
         add_sycl_to_target(TARGET ${_trgt} SOURCES ${_generated_src})
     endif()

From 9f078d9e95dfb3c3c107c5198759100849444130 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Fri, 6 Oct 2023 16:25:22 -0500
Subject: [PATCH 25/83] Set no-fast-math for reduction_over_axis too
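Reduction kernels need IEEE-conformant NaN and infinity semantics, which
fast-math removes; that is presumably the motivation here. An illustrative
C++ fragment (not part of this patch) showing the kind of check fast-math
can miscompile:

    #include <cmath>
    #include <cstddef>

    // Under -ffast-math the compiler may assume NaNs never occur, so this
    // test can be folded to a constant false, silently changing the result
    // of NaN-propagating reductions.
    bool has_nan(const float *v, std::size_t n)
    {
        for (std::size_t i = 0; i < n; ++i) {
            if (std::isnan(v[i]))
                return true;
        }
        return false;
    }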
---
 dpctl/tensor/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt
index f9cdc83e92..aba009411d 100644
--- a/dpctl/tensor/CMakeLists.txt
+++ b/dpctl/tensor/CMakeLists.txt
@@ -64,6 +64,7 @@ set(_no_fast_math_sources
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reduction_over_axis.cpp
 )
 foreach(_src_fn ${_no_fast_math_sources})
     get_source_file_property(_cmpl_options_prop ${_src_fn} COMPILE_OPTIONS)

From 30b49b1840c7f20c96ca157b0608f4e45ddb4b6d Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Sun, 8 Oct 2023 13:15:06 -0500
Subject: [PATCH 26/83] Channel OVERRIDE_INTEL_IPO env. variable

Set this variable in public CI to override the use of interprocedural
optimization, avoiding insufficient-resources failures during compilation
on Windows.

---
 .github/workflows/conda-package.yml | 2 ++
 conda-recipe/bld.bat                | 5 +++++
 conda-recipe/meta.yaml              | 1 +
 3 files changed, 8 insertions(+)

diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index e715c444c0..dd9389c40d 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -102,6 +102,8 @@ jobs:
         run: |
           echo "WHEELS_OUTPUT_FOLDER=$GITHUB_WORKSPACE${{ runner.os == 'Linux' && '/' || '\\' }}" >> $GITHUB_ENV
       - name: Build conda package
+        env:
+          OVERRIDE_INTEL_IPO: 1 # IPO requires more resources than the GH Actions VM provides
         run: conda build --no-test --python ${{ matrix.python }} -c intel -c conda-forge --override-channels conda-recipe
       - name: Upload artifact
         uses: actions/upload-artifact@v3
diff --git a/conda-recipe/bld.bat b/conda-recipe/bld.bat
index 07fa580bb4..e92e505b63 100644
--- a/conda-recipe/bld.bat
+++ b/conda-recipe/bld.bat
@@ -6,6 +6,11 @@ set "INCLUDE=%BUILD_PREFIX%\include;%INCLUDE%"
 "%PYTHON%" setup.py clean --all
 set "SKBUILD_ARGS=-G Ninja -- -DCMAKE_C_COMPILER:PATH=icx -DCMAKE_CXX_COMPILER:PATH=icx -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON"
 
+REM Overriding IPO is useful for building in resource-constrained VMs (public CI)
+if DEFINED OVERRIDE_INTEL_IPO (
+    set "SKBUILD_ARGS=%SKBUILD_ARGS% -DCMAKE_INTERPROCEDURAL_OPTIMIZATION:BOOL=FALSE"
+)
+
 FOR %%V IN (14.0.0 14 15.0.0 15 16.0.0 16 17.0.0 17) DO @(
     REM set DIR_HINT if directory exists
     IF EXIST "%BUILD_PREFIX%\Library\lib\clang\%%V\" (
diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
index 7b6f071610..2806fb9262 100644
--- a/conda-recipe/meta.yaml
+++ b/conda-recipe/meta.yaml
@@ -9,6 +9,7 @@ build:
   number: {{ GIT_DESCRIBE_NUMBER }}
   script_env:
     - WHEELS_OUTPUT_FOLDER
+    - OVERRIDE_INTEL_IPO # [win]
 
 requirements:
   build:

From 550b20e17b7c1bf2b89b1b47cba341acd77c5515 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Mon, 9 Oct 2023 14:12:20 -0500
Subject: [PATCH 27/83] _device_queries extension needs SYCL

---
 dpctl/utils/CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/dpctl/utils/CMakeLists.txt b/dpctl/utils/CMakeLists.txt
index 8bc65e3056..aadc1c0fe0 100644
--- a/dpctl/utils/CMakeLists.txt
+++ b/dpctl/utils/CMakeLists.txt
@@ -16,9 +16,11 @@ add_custom_target(_dpctl4pybind11_header_ready
 )
 
 set(python_module_name _device_queries)
+set(_module_src ${CMAKE_CURRENT_SOURCE_DIR}/src/device_queries.cpp)
 pybind11_add_module(${python_module_name} MODULE
-    ${CMAKE_CURRENT_SOURCE_DIR}/src/device_queries.cpp
+
${_module_src} ) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src}) target_include_directories(${python_module_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../include From 6aa8023d0d2c68fa140f0cc982651f0a8437412f Mon Sep 17 00:00:00 2001 From: ndgrigorian <46709016+ndgrigorian@users.noreply.github.com> Date: Mon, 9 Oct 2023 14:16:09 -0700 Subject: [PATCH 28/83] Fixes gh-1432 (#1433) Caused by a typo in the Python binding changes made in #1427 Added a test for correct behavior --- dpctl/tensor/libtensor/source/repeat.cpp | 4 +--- dpctl/tests/test_usm_ndarray_manipulation.py | 6 ++++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/dpctl/tensor/libtensor/source/repeat.cpp b/dpctl/tensor/libtensor/source/repeat.cpp index 391f995feb..f3a20cbbaa 100644 --- a/dpctl/tensor/libtensor/source/repeat.cpp +++ b/dpctl/tensor/libtensor/source/repeat.cpp @@ -136,7 +136,6 @@ py_repeat_by_sequence(const dpctl::tensor::usm_ndarray &src, const py::ssize_t *dst_shape = dst.get_shape_raw(); bool same_orthog_dims(true); size_t orthog_nelems(1); // number of orthogonal iterations - for (auto i = 0; i < axis; ++i) { auto src_sh_i = src_shape[i]; orthog_nelems *= src_sh_i; @@ -554,7 +553,6 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, const py::ssize_t *dst_shape = dst.get_shape_raw(); bool same_orthog_dims(true); size_t orthog_nelems(1); // number of orthogonal iterations - for (auto i = 0; i < axis; ++i) { auto src_sh_i = src_shape[i]; orthog_nelems *= src_sh_i; @@ -634,7 +632,7 @@ py_repeat_by_scalar(const dpctl::tensor::usm_ndarray &src, assert(dst_shape_vec.size() == 1); assert(dst_strides_vec.size() == 1); - if (src_nd > 0) { + if (src_nd == 0) { src_shape_vec = {0}; src_strides_vec = {0}; } diff --git a/dpctl/tests/test_usm_ndarray_manipulation.py b/dpctl/tests/test_usm_ndarray_manipulation.py index ae32afdba9..f3704274d4 100644 --- a/dpctl/tests/test_usm_ndarray_manipulation.py +++ b/dpctl/tests/test_usm_ndarray_manipulation.py @@ -1170,6 +1170,12 @@ def test_repeat_axes(): res = dpt.repeat(x, reps, axis=1) assert dpt.all(res == expected_res) + x = dpt.arange(10, dtype="i4") + expected_res = dpt.empty(x.shape[0] * reps, x.dtype) + expected_res[::2], expected_res[1::2] = x, x + res = dpt.repeat(x, reps, axis=0) + assert dpt.all(res == expected_res) + def test_repeat_size_0_outputs(): get_queue_or_skip() From e88583887cb2bae046182235c5a76b043f186eab Mon Sep 17 00:00:00 2001 From: ndgrigorian <46709016+ndgrigorian@users.noreply.github.com> Date: Tue, 10 Oct 2023 18:23:04 -0700 Subject: [PATCH 29/83] Dedicated kernels for in-place ``dpt.divide`` and ``dpt.floor_divide`` (#1431) * Implements dedicated kernels for in-place division Includes floor division and true division * Adds tests for inplace division behavior * Adds a `static_assert` check to TrueDivideInplaceTypeMapFactory Checks that the result type is either the same as the third template parameter, or none Adds a comment to TrueDivideInplaceOutputType --- dpctl/tensor/_elementwise_funcs.py | 2 + .../elementwise_functions/floor_divide.hpp | 196 +++++++++++++-- .../elementwise_functions/true_divide.hpp | 227 ++++++++++++++++++ .../source/elementwise_functions.cpp | 112 +++++++++ dpctl/tests/elementwise/test_divide.py | 71 +++++- dpctl/tests/elementwise/test_floor_divide.py | 72 +++++- 6 files changed, 657 insertions(+), 23 deletions(-) diff --git a/dpctl/tensor/_elementwise_funcs.py b/dpctl/tensor/_elementwise_funcs.py index 8e2abee837..259443f8e3 100644 --- a/dpctl/tensor/_elementwise_funcs.py +++ 
b/dpctl/tensor/_elementwise_funcs.py @@ -590,6 +590,7 @@ ti._divide_result_type, ti._divide, _divide_docstring_, + binary_inplace_fn=ti._divide_inplace, acceptance_fn=_acceptance_fn_divide, ) @@ -720,6 +721,7 @@ ti._floor_divide_result_type, ti._floor_divide, _floor_divide_docstring_, + binary_inplace_fn=ti._floor_divide_inplace, ) # B11: ==== GREATER (x1, x2) diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp index ad75924070..025d7e8bc4 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp @@ -57,12 +57,7 @@ struct FloorDivideFunctor resT operator()(const argT1 &in1, const argT2 &in2) const { - if constexpr (std::is_same_v && - std::is_same_v) { - return (in2) ? static_cast(in1) : resT(0); - } - else if constexpr (std::is_integral_v || - std::is_integral_v) { + if constexpr (std::is_integral_v || std::is_integral_v) { if (in2 == argT2(0)) { return resT(0); } @@ -87,16 +82,7 @@ struct FloorDivideFunctor operator()(const sycl::vec &in1, const sycl::vec &in2) const { - if constexpr (std::is_same_v && - std::is_same_v) { - sycl::vec res; -#pragma unroll - for (int i = 0; i < vec_sz; ++i) { - res[i] = (in2[i]) ? static_cast(in1[i]) : resT(0); - } - return res; - } - else if constexpr (std::is_integral_v) { + if constexpr (std::is_integral_v) { sycl::vec res; #pragma unroll for (int i = 0; i < vec_sz; ++i) { @@ -165,7 +151,6 @@ template struct FloorDivideOutputType { using value_type = typename std::disjunction< // disjunction is C++17 // feature, supported by DPC++ - td_ns::BinaryTypeMapResultEntry, td_ns::BinaryTypeMapResultEntry struct FloorDivideInplaceFunctor +{ + using supports_sg_loadstore = std::true_type; + using supports_vec = std::true_type; + + void operator()(resT &in1, const argT &in2) const + { + if constexpr (std::is_integral_v) { + if (in2 == argT(0)) { + in1 = 0; + return; + } + if constexpr (std::is_signed_v) { + auto tmp = in1; + in1 /= in2; + auto mod = tmp % in2; + auto corr = (mod != 0 && l_xor(mod < 0, in2 < 0)); + in1 -= corr; + } + else { + in1 /= in2; + } + } + else { + in1 /= in2; + if (in1 == resT(0)) { + return; + } + in1 = std::floor(in1); + } + } + + template + void operator()(sycl::vec &in1, + const sycl::vec &in2) const + { + if constexpr (std::is_integral_v) { +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + if (in2[i] == argT(0)) { + in1[i] = 0; + } + else { + if constexpr (std::is_signed_v) { + auto tmp = in1[i]; + in1[i] /= in2[i]; + auto mod = tmp % in2[i]; + auto corr = (mod != 0 && l_xor(mod < 0, in2[i] < 0)); + in1[i] -= corr; + } + else { + in1[i] /= in2[i]; + } + } + } + } + else { + in1 /= in2; +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + if (in2[i] != argT(0)) { + in1[i] = std::floor(in1[i]); + } + } + } + } + +private: + bool l_xor(bool b1, bool b2) const + { + return (b1 != b2); + } +}; + +template +using FloorDivideInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + FloorDivideInplaceFunctor, + vec_sz, + n_vecs>; + +template +using FloorDivideInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + FloorDivideInplaceFunctor>; + +template +class floor_divide_inplace_contig_kernel; + +template +sycl::event +floor_divide_inplace_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + 
py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, FloorDivideInplaceContigFunctor, + floor_divide_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset, + res_p, res_offset, depends); +} + +template +struct FloorDivideInplaceContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename FloorDivideOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = floor_divide_inplace_contig_impl; + return fn; + } + } +}; + +template +class floor_divide_inplace_strided_kernel; + +template +sycl::event floor_divide_inplace_strided_impl( + sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, FloorDivideInplaceStridedFunctor, + floor_divide_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct FloorDivideInplaceStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename FloorDivideOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = floor_divide_inplace_strided_impl; + return fn; + } + } +}; + } // namespace floor_divide } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp index 9f488e6598..138f7a3f91 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp @@ -370,6 +370,233 @@ struct TrueDivideContigRowContigMatrixBroadcastFactory } }; +template struct TrueDivideInplaceFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + void operator()(resT &res, const argT &in) + { + res /= in; + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) + { + res /= in; + } +}; + +// cannot use the out of place table, as it permits real lhs and complex rhs +// T1 corresponds to the type of the rhs, while T2 corresponds to the lhs +// the type of the result must be the same as T2 +template struct TrueDivideInplaceOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + T2, + std::complex, + std::complex>, + td_ns::BinaryTypeMapResultEntry, + std::complex>, + td_ns::DefaultResultEntry>::result_type; +}; + +template +struct TrueDivideInplaceTypeMapFactory +{ + /*! 
@brief get typeid for output type of divide(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename TrueDivideInplaceOutputType::value_type; + static_assert(std::is_same_v || std::is_same_v); + return td_ns::GetTypeid{}.get(); + } +}; + +template +using TrueDivideInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + TrueDivideInplaceFunctor, + vec_sz, + n_vecs>; + +template +using TrueDivideInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + TrueDivideInplaceFunctor>; + +template +class true_divide_inplace_contig_kernel; + +template +sycl::event +true_divide_inplace_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, TrueDivideInplaceContigFunctor, + true_divide_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset, + res_p, res_offset, depends); +} + +template +struct TrueDivideInplaceContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = true_divide_inplace_contig_impl; + return fn; + } + } +}; + +template +class true_divide_inplace_strided_kernel; + +template +sycl::event true_divide_inplace_strided_impl( + sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, TrueDivideInplaceStridedFunctor, + true_divide_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct TrueDivideInplaceStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = true_divide_inplace_strided_impl; + return fn; + } + } +}; + +template +class true_divide_inplace_row_matrix_broadcast_sg_krn; + +template +using TrueDivideInplaceRowMatrixBroadcastingFunctor = + elementwise_common::BinaryInplaceRowMatrixBroadcastingFunctor< + argT, + resT, + TrueDivideInplaceFunctor>; + +template +sycl::event true_divide_inplace_row_matrix_broadcast_impl( + sycl::queue &exec_q, + std::vector &host_tasks, + size_t n0, + size_t n1, + const char *vec_p, // typeless pointer to (n1,) contiguous row + py::ssize_t vec_offset, + char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix + py::ssize_t mat_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_row_matrix_broadcast_impl< + argT, resT, TrueDivideInplaceRowMatrixBroadcastingFunctor, + true_divide_inplace_row_matrix_broadcast_sg_krn>( + exec_q, host_tasks, n0, n1, vec_p, vec_offset, mat_p, mat_offset, + depends); +} + +template +struct TrueDivideInplaceRowMatrixBroadcastFactory +{ + fnT get() + { + using resT = typename TrueDivideInplaceOutputType::value_type; + if constexpr (!std::is_same_v) { + fnT fn = nullptr; + return fn; + } + else { + if constexpr (dpctl::tensor::type_utils::is_complex::value || + dpctl::tensor::type_utils::is_complex::value) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = true_divide_inplace_row_matrix_broadcast_impl; + return fn; + } + } + } +}; + } // 
namespace true_divide } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/source/elementwise_functions.cpp b/dpctl/tensor/libtensor/source/elementwise_functions.cpp index cca0ac7c0a..3cca479a3f 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions.cpp @@ -933,6 +933,8 @@ namespace true_divide_fn_ns = dpctl::tensor::kernels::true_divide; static binary_contig_impl_fn_ptr_t true_divide_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; static int true_divide_output_id_table[td_ns::num_types][td_ns::num_types]; +static int true_divide_inplace_output_id_table[td_ns::num_types] + [td_ns::num_types]; static binary_strided_impl_fn_ptr_t true_divide_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; @@ -947,6 +949,16 @@ static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t true_divide_contig_row_contig_matrix_broadcast_dispatch_table [td_ns::num_types][td_ns::num_types]; +static binary_inplace_contig_impl_fn_ptr_t + true_divide_inplace_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + true_divide_inplace_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t + true_divide_inplace_row_matrix_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + void populate_true_divide_dispatch_tables(void) { using namespace td_ns; @@ -990,6 +1002,33 @@ void populate_true_divide_dispatch_tables(void) dtb5; dtb5.populate_dispatch_table( true_divide_contig_row_contig_matrix_broadcast_dispatch_table); + + // which input types are supported, and what is the type of the result + using fn_ns::TrueDivideInplaceTypeMapFactory; + DispatchTableBuilder dtb6; + dtb6.populate_dispatch_table(true_divide_inplace_output_id_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::TrueDivideInplaceStridedFactory; + DispatchTableBuilder + dtb7; + dtb7.populate_dispatch_table(true_divide_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::TrueDivideInplaceContigFactory; + DispatchTableBuilder + dtb8; + dtb8.populate_dispatch_table(true_divide_inplace_contig_dispatch_table); + + // function pointers for inplace operation on contiguous matrix + // and contiguous row + using fn_ns::TrueDivideInplaceRowMatrixBroadcastFactory; + DispatchTableBuilder + dtb9; + dtb9.populate_dispatch_table(true_divide_inplace_row_matrix_dispatch_table); }; } // namespace impl @@ -1151,6 +1190,13 @@ static int floor_divide_output_id_table[td_ns::num_types][td_ns::num_types]; static binary_strided_impl_fn_ptr_t floor_divide_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; +static binary_inplace_contig_impl_fn_ptr_t + floor_divide_inplace_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + floor_divide_inplace_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + void populate_floor_divide_dispatch_tables(void) { using namespace td_ns; @@ -1174,6 +1220,20 @@ void populate_floor_divide_dispatch_tables(void) num_types> dtb3; dtb3.populate_dispatch_table(floor_divide_contig_dispatch_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::FloorDivideInplaceStridedFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(floor_divide_inplace_strided_dispatch_table); + + // 
function pointers for inplace operation on contiguous inputs and output + using fn_ns::FloorDivideInplaceContigFactory; + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(floor_divide_inplace_contig_dispatch_table); }; } // namespace impl @@ -3379,6 +3439,33 @@ void init_elementwise_functions(py::module_ m) py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list()); m.def("_divide_result_type", divide_result_type_pyapi, ""); + + using impl::true_divide_inplace_contig_dispatch_table; + using impl::true_divide_inplace_output_id_table; + using impl::true_divide_inplace_row_matrix_dispatch_table; + using impl::true_divide_inplace_strided_dispatch_table; + + auto divide_inplace_pyapi = + [&](const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, + const std::vector &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, + true_divide_inplace_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + true_divide_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + true_divide_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + true_divide_inplace_row_matrix_dispatch_table); + }; + m.def("_divide_inplace", divide_inplace_pyapi, "", py::arg("lhs"), + py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); } // B09: ==== EQUAL (x1, x2) @@ -3531,6 +3618,31 @@ void init_elementwise_functions(py::module_ m) py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list()); m.def("_floor_divide_result_type", floor_divide_result_type_pyapi, ""); + + using impl::floor_divide_inplace_contig_dispatch_table; + using impl::floor_divide_inplace_strided_dispatch_table; + + auto floor_divide_inplace_pyapi = + [&](const dpctl::tensor::usm_ndarray &src, + const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, + const std::vector &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, floor_divide_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + floor_divide_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + floor_divide_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + td_ns::NullPtrTable< + binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{}); + }; + m.def("_floor_divide_inplace", floor_divide_inplace_pyapi, "", + py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); } // B11: ==== GREATER (x1, x2) diff --git a/dpctl/tests/elementwise/test_divide.py b/dpctl/tests/elementwise/test_divide.py index 41aac736d7..a54060792c 100644 --- a/dpctl/tests/elementwise/test_divide.py +++ b/dpctl/tests/elementwise/test_divide.py @@ -21,9 +21,16 @@ import dpctl import dpctl.tensor as dpt +from dpctl.tensor._type_utils import _can_cast from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported -from .utils import _all_dtypes, _compare_dtypes, _usm_types +from .utils import ( + _all_dtypes, + _compare_dtypes, + _complex_fp_dtypes, + _real_fp_dtypes, + _usm_types, +) @pytest.mark.parametrize("op1_dtype", 
_all_dtypes) @@ -187,3 +194,65 @@ def __sycl_usm_array_interface__(self): c = Canary() with pytest.raises(ValueError): dpt.divide(a, c) + + +@pytest.mark.parametrize("dtype", _real_fp_dtypes + _complex_fp_dtypes) +def test_divide_inplace_python_scalar(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q) + dt_kind = X.dtype.kind + if dt_kind == "f": + X /= float(1) + elif dt_kind == "c": + X /= complex(1) + + +@pytest.mark.parametrize("op1_dtype", _all_dtypes) +@pytest.mark.parametrize("op2_dtype", _all_dtypes) +def test_divide_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + # out array only valid if it is inexact + if ( + _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64) + and dpt.dtype(op1_dtype).kind in "fc" + ): + ar1 /= ar2 + assert dpt.all(ar1 == 1) + + ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1] + ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2] + ar3 /= ar4 + assert dpt.all(ar3 == 1) + else: + with pytest.raises(TypeError): + ar1 /= ar2 + dpt.divide(ar1, ar2, out=ar1) + + # out is second arg + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + if ( + _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64) + and dpt.dtype(op2_dtype).kind in "fc" + ): + dpt.divide(ar1, ar2, out=ar2) + assert dpt.all(ar2 == 1) + + ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1] + ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2] + dpt.divide(ar3, ar4, out=ar4) + dpt.all(ar4 == 1) + else: + with pytest.raises(TypeError): + dpt.divide(ar1, ar2, out=ar2) diff --git a/dpctl/tests/elementwise/test_floor_divide.py b/dpctl/tests/elementwise/test_floor_divide.py index c8ba5e80f1..b57c006cdf 100644 --- a/dpctl/tests/elementwise/test_floor_divide.py +++ b/dpctl/tests/elementwise/test_floor_divide.py @@ -21,13 +21,19 @@ import dpctl import dpctl.tensor as dpt +from dpctl.tensor._type_utils import _can_cast from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported -from .utils import _compare_dtypes, _no_complex_dtypes, _usm_types +from .utils import ( + _compare_dtypes, + _integral_dtypes, + _no_complex_dtypes, + _usm_types, +) -@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes) -@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes) +@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:]) +@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:]) def test_floor_divide_dtype_matrix(op1_dtype, op2_dtype): q = get_queue_or_skip() skip_if_dtype_not_supported(op1_dtype, q) @@ -133,7 +139,7 @@ def test_floor_divide_broadcasting(): assert (dpt.asnumpy(r2) == expected2.astype(r2.dtype)).all() -@pytest.mark.parametrize("arr_dt", _no_complex_dtypes) +@pytest.mark.parametrize("arr_dt", _no_complex_dtypes[1:]) def test_floor_divide_python_scalar(arr_dt): q = get_queue_or_skip() skip_if_dtype_not_supported(arr_dt, q) @@ -204,7 +210,7 @@ def test_floor_divide_gh_1247(): ) -@pytest.mark.parametrize("dtype", _no_complex_dtypes[1:9]) +@pytest.mark.parametrize("dtype", _integral_dtypes) def test_floor_divide_integer_zero(dtype): q = get_queue_or_skip() skip_if_dtype_not_supported(dtype, q) @@ -255,3 +261,59 @@ def 
test_floor_divide_special_cases(): res = dpt.floor_divide(x, y) res_np = np.floor_divide(dpt.asnumpy(x), dpt.asnumpy(y)) np.testing.assert_array_equal(dpt.asnumpy(res), res_np) + + +@pytest.mark.parametrize("dtype", _no_complex_dtypes[1:]) +def test_divide_inplace_python_scalar(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q) + dt_kind = X.dtype.kind + if dt_kind in "ui": + X //= int(1) + elif dt_kind == "f": + X //= float(1) + + +@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:]) +@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:]) +def test_floor_divide_inplace_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + # out array only valid if it is inexact + if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64): + ar1 //= ar2 + assert dpt.all(ar1 == 1) + + ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1] + ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2] + ar3 //= ar4 + assert dpt.all(ar3 == 1) + else: + with pytest.raises(TypeError): + ar1 //= ar2 + dpt.floor_divide(ar1, ar2, out=ar1) + + # out is second arg + ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q) + if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64): + dpt.floor_divide(ar1, ar2, out=ar2) + assert dpt.all(ar2 == 1) + + ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1] + ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2] + dpt.floor_divide(ar3, ar4, out=ar4) + dpt.all(ar4 == 1) + else: + with pytest.raises(TypeError): + dpt.floor_divide(ar1, ar2, out=ar2) From 4a0c17129dbf5969ac89293c6096f92274337314 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 11 Oct 2023 07:04:30 -0500 Subject: [PATCH 30/83] SyclEvent._wait static function to use _Wait, not _WaitAndThrow --- dpctl/_sycl_event.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/_sycl_event.pyx b/dpctl/_sycl_event.pyx index 34576a2ef7..3b8f89d4c5 100644 --- a/dpctl/_sycl_event.pyx +++ b/dpctl/_sycl_event.pyx @@ -218,7 +218,7 @@ cdef class SyclEvent(_SyclEvent): @staticmethod cdef void _wait(SyclEvent event): - with nogil: DPCTLEvent_WaitAndThrow(event._event_ref) + with nogil: DPCTLEvent_Wait(event._event_ref) @staticmethod def wait_for(event): From 058107d00f9a2b006612b26409d92ea62464c057 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Wed, 11 Oct 2023 10:55:35 -0500 Subject: [PATCH 31/83] SyclContext and SyclQueue creation now use NULL async_handler pointer The default_async_handler function pointer used previously was not useful, since the integral argument passed to it by libSyclInterface did not convey useful information to take meaningful actions, and raising SyclAsyncErrorException is not helpful. 
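For reference, a handler that could act meaningfully has to receive the
whole sycl::exception_list, which an int-valued C callback cannot carry.
A minimal sketch in plain SYCL (illustrative only, not dpctl API):

    #include <sycl/sycl.hpp>
    #include <iostream>

    int main()
    {
        // The async_handler is invoked with every unconsumed asynchronous
        // error; here we simply rethrow and report each one.
        auto handler = [](sycl::exception_list elist) {
            for (const std::exception_ptr &e : elist) {
                try {
                    std::rethrow_exception(e);
                } catch (const sycl::exception &exc) {
                    std::cerr << "async SYCL error: " << exc.what() << '\n';
                }
            }
        };
        sycl::queue q{sycl::default_selector_v, handler};
        return 0;
    }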
--- dpctl/_sycl_context.pyx | 7 ++----- dpctl/_sycl_queue.pxd | 2 -- dpctl/_sycl_queue.pyx | 19 +++---------------- 3 files changed, 5 insertions(+), 23 deletions(-) diff --git a/dpctl/_sycl_context.pyx b/dpctl/_sycl_context.pyx index 71c2fb7591..a9873fa0de 100644 --- a/dpctl/_sycl_context.pyx +++ b/dpctl/_sycl_context.pyx @@ -48,7 +48,6 @@ from ._backend cimport ( # noqa: E211 error_handler_callback, ) from ._sycl_device cimport SyclDevice -from ._sycl_queue cimport default_async_error_handler from ._sycl_device import SyclDeviceCreationError __all__ = [ @@ -201,8 +200,7 @@ cdef class SyclContext(_SyclContext): cdef int _init_context_from_one_device(self, SyclDevice device, int props): cdef DPCTLSyclDeviceRef DRef = device.get_device_ref() cdef DPCTLSyclContextRef CRef = NULL - cdef error_handler_callback * eh_callback = ( - &default_async_error_handler) + cdef error_handler_callback * eh_callback = NULL # look up cached contexts for root devices first CRef = DPCTLDeviceMgr_GetCachedContext(DRef) if (CRef is NULL): @@ -219,8 +217,7 @@ cdef class SyclContext(_SyclContext): cdef int j = 0 cdef size_t num_bytes cdef DPCTLDeviceVectorRef DVRef = NULL - cdef error_handler_callback * eh_callback = ( - &default_async_error_handler) + cdef error_handler_callback * eh_callback = NULL cdef DPCTLSyclContextRef CRef = NULL cdef DPCTLSyclDeviceRef *elems diff --git a/dpctl/_sycl_queue.pxd b/dpctl/_sycl_queue.pxd index c906ada4d6..8f9028fabf 100644 --- a/dpctl/_sycl_queue.pxd +++ b/dpctl/_sycl_queue.pxd @@ -29,8 +29,6 @@ from ._sycl_event cimport SyclEvent from .program._program cimport SyclKernel -cdef void default_async_error_handler(int) except * nogil - cdef public api class _SyclQueue [ object Py_SyclQueueObject, type Py_SyclQueueType ]: diff --git a/dpctl/_sycl_queue.pyx b/dpctl/_sycl_queue.pyx index 6acf3396e1..361b9d5924 100644 --- a/dpctl/_sycl_queue.pyx +++ b/dpctl/_sycl_queue.pyx @@ -56,7 +56,6 @@ from ._backend cimport ( # noqa: E211 _arg_data_type, _backend_type, _queue_property_type, - error_handler_callback, ) from .memory._memory cimport _Memory @@ -114,18 +113,6 @@ cdef class SyclQueueCreationError(Exception): pass -cdef class SyclAsynchronousError(Exception): - """ - A SyclAsynchronousError exception is raised when SYCL operation submission - or execution encounters an error. 
- """ - - -cdef void default_async_error_handler(int err) except * nogil: - with gil: - raise SyclAsynchronousError(err) - - cdef int _parse_queue_properties(object prop) except *: cdef int res = 0 cdef object props @@ -404,7 +391,7 @@ cdef class SyclQueue(_SyclQueue): QRef = DPCTLQueue_Create( CRef, DRef, - &default_async_error_handler, + NULL, props ) if QRef is NULL: @@ -481,7 +468,7 @@ cdef class SyclQueue(_SyclQueue): QRef = DPCTLQueue_Create( CRef, DRef, - &default_async_error_handler, + NULL, props ) if (QRef is NULL): @@ -566,7 +553,7 @@ cdef class SyclQueue(_SyclQueue): qref = DPCTLQueue_Create( cref, dref, - &default_async_error_handler, + NULL, props ) if qref is NULL: From 20a74fe0c238f35fe81d1ecebf65d29e034c6bf6 Mon Sep 17 00:00:00 2001 From: Yevhenii Havrylko Date: Wed, 11 Oct 2023 13:06:08 -0400 Subject: [PATCH 32/83] Upload wheel to the same version --- .github/workflows/conda-package.yml | 10 ++++++++-- conda-recipe/bld.bat | 2 +- conda-recipe/build.sh | 6 +----- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml index dd9389c40d..032c38c5e2 100644 --- a/.github/workflows/conda-package.yml +++ b/.github/workflows/conda-package.yml @@ -337,6 +337,8 @@ jobs: run: conda install anaconda-client - name: Add conda to system path run: echo $CONDA/bin >> $GITHUB_PATH + - name: Package version + run: echo "PACKAGE_VERSION=$(basename ${{ env.PACKAGE_NAME }}-*.tar.bz2 | sed 's/^${{ env.PACKAGE_NAME }}-\([^-]*\).*/\1/')" >> $GITHUB_ENV - name: Upload env: @@ -347,7 +349,7 @@ jobs: - name: Upload Wheels env: ANACONDA_TOKEN: ${{ secrets.ANACONDA_TOKEN }} - run: anaconda --token ${{ env.ANACONDA_TOKEN }} upload --user dppy --label dev ${{ env.PACKAGE_NAME }}-*.whl + run: anaconda --token ${{ env.ANACONDA_TOKEN }} upload --user dppy --label dev ${{ env.PACKAGE_NAME }}-*.whl --version ${{ env.PACKAGE_VERSION }} upload_windows: needs: test_windows @@ -375,6 +377,10 @@ jobs: - name: Install anaconda-client run: conda install anaconda-client + - name: Package version + shell: bash -el {0} + run: echo "PACKAGE_VERSION=$(basename ${{ env.PACKAGE_NAME }}-*.tar.bz2 | sed 's/^${{ env.PACKAGE_NAME }}-\([^-]*\).*/\1/')" >> $GITHUB_ENV + - name: Upload env: ANACONDA_TOKEN: ${{ secrets.ANACONDA_TOKEN }} @@ -384,7 +390,7 @@ jobs: - name: Upload Wheels env: ANACONDA_TOKEN: ${{ secrets.ANACONDA_TOKEN }} - run: anaconda --token ${{ env.ANACONDA_TOKEN }} upload --user dppy --label dev ${{ env.PACKAGE_NAME }}-*.whl + run: anaconda --token ${{ env.ANACONDA_TOKEN }} upload --user dppy --label dev ${{ env.PACKAGE_NAME }}-*.whl --version ${{ env.PACKAGE_VERSION }} test_examples_linux: needs: build_linux diff --git a/conda-recipe/bld.bat b/conda-recipe/bld.bat index e92e505b63..e87254b446 100644 --- a/conda-recipe/bld.bat +++ b/conda-recipe/bld.bat @@ -34,7 +34,7 @@ if EXIST "%PLATFORM_DIR%" ( if NOT "%WHEELS_OUTPUT_FOLDER%"=="" ( rem Install and assemble wheel package from the build bits - "%PYTHON%" setup.py install bdist_wheel %SKBUILD_ARGS% + "%PYTHON%" setup.py install bdist_wheel --build-number %GIT_DESCRIBE_NUMBER% %SKBUILD_ARGS% if errorlevel 1 exit 1 copy dist\dpctl*.whl %WHEELS_OUTPUT_FOLDER% if errorlevel 1 exit 1 diff --git a/conda-recipe/build.sh b/conda-recipe/build.sh index 87155d3fb3..f21660ec50 100755 --- a/conda-recipe/build.sh +++ b/conda-recipe/build.sh @@ -17,11 +17,7 @@ echo "${PYTHON} setup.py install ${SKBUILD_ARGS}" if [ -n "${WHEELS_OUTPUT_FOLDER}" ]; then # Install packages and assemble wheel package 
from built bits
-    if [ "$CONDA_PY" == "36" ]; then
-        WHEELS_BUILD_ARGS="-p manylinux1_x86_64"
-    else
-        WHEELS_BUILD_ARGS="-p manylinux2014_x86_64"
-    fi
+    WHEELS_BUILD_ARGS="-p manylinux2014_x86_64 --build-number ${GIT_DESCRIBE_NUMBER}"
     ${PYTHON} setup.py install bdist_wheel ${WHEELS_BUILD_ARGS} ${SKBUILD_ARGS}
     cp dist/dpctl*.whl ${WHEELS_OUTPUT_FOLDER}
 else

From e1e2dbac4a51a685958299396537d7f86ed3502e Mon Sep 17 00:00:00 2001
From: Yevhenii Havrylko
Date: Thu, 12 Oct 2023 16:55:32 -0400
Subject: [PATCH 33/83] Add cleanup job

---
 .github/workflows/conda-package.yml | 30 +++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index 032c38c5e2..b09be78b08 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -392,6 +392,36 @@ jobs:
           ANACONDA_TOKEN: ${{ secrets.ANACONDA_TOKEN }}
         run: anaconda --token ${{ env.ANACONDA_TOKEN }} upload --user dppy --label dev ${{ env.PACKAGE_NAME }}-*.whl --version ${{ env.PACKAGE_VERSION }}
 
+  cleanup_packages:
+    name: Clean up anaconda packages
+    needs: [upload_linux, upload_windows]
+    runs-on: 'ubuntu-latest'
+    defaults:
+      run:
+        shell: bash -el {0}
+    steps:
+      - uses: conda-incubator/setup-miniconda@v2
+        with:
+          run-post: false
+          channel-priority: "disabled"
+          channels: conda-forge
+          python-version: '3.11'
+
+      - name: Install anaconda-client
+        run: conda install anaconda-client
+
+      - name: Checkout repo
+        uses: actions/checkout@v2
+        with:
+          repository: IntelPython/devops-tools
+          fetch-depth: 0
+
+      - name: Cleanup old packages
+        run: |
+          python scripts/cleanup-old-packages.py \
+            --verbose --force --token ${{ secrets.ANACONDA_TOKEN }} \
+            --package dppy/${{ env.PACKAGE_NAME }} --label dev
+
   test_examples_linux:
     needs: build_linux
     runs-on: ${{ matrix.runner }}

From 67c7dbfdebc30c66313fa20486e0972299df4dc2 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Fri, 13 Oct 2023 11:50:18 -0500
Subject: [PATCH 34/83] Resolves setuptools "Package would be ignored" warnings

Removed MANIFEST.in, set include_package_data=False, and added the relevant
folders and files to the package_data keyword of the setup command.
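The general setuptools pattern (a sketch with a hypothetical package name,
not the exact dpctl invocation): with include_package_data disabled, built
packages ship only the files that package_data explicitly names, so the
globs take over the role MANIFEST.in used to play:

    from setuptools import setup

    setup(
        name="example_pkg",  # hypothetical
        packages=["example_pkg"],
        # rely solely on the globs below, not on MANIFEST.in / VCS state
        include_package_data=False,
        package_data={"example_pkg": ["include/*.h", "*.pxd"]},
    )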
--- MANIFEST.in | 23 ----------------------- setup.py | 26 ++++++++++++++++++++++++-- 2 files changed, 24 insertions(+), 25 deletions(-) delete mode 100644 MANIFEST.in diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index d72914b30b..0000000000 --- a/MANIFEST.in +++ /dev/null @@ -1,23 +0,0 @@ -recursive-include dpctl/include *.h -recursive-include dpctl/include *.hpp -include dpctl/include/dpctl4pybind11.hpp -recursive-include dpctl *.pxd -recursive-include dpctl *.cmake -include dpctl/_sycl_context.h -include dpctl/_sycl_context_api.h -include dpctl/_sycl_device.h -include dpctl/_sycl_device_api.h -include dpctl/_sycl_queue.h -include dpctl/_sycl_queue_api.h -include dpctl/_sycl_event.h -include dpctl/_sycl_event_api.h -include dpctl/memory/_memory.h -include dpctl/memory/_memory_api.h -include dpctl/program/_program.h -include dpctl/program/_program_api.h -include dpctl/tensor/_usmarray.h -include dpctl/tensor/_usmarray_api.h -recursive-include dpctl/tensor/include * -recursive-include dpctl/tensor/libtensor/include * -include dpctl/tests/input_files/* -include dpctl/tests/*.pyx diff --git a/setup.py b/setup.py index f6780633cc..eb942a71b4 100644 --- a/setup.py +++ b/setup.py @@ -176,9 +176,31 @@ def _get_cmdclass(): "dpctl.utils", ], package_data={ - "dpctl": ["tests/*.*", "tests/helper/*.py", "tests/elementwise/*.py"] + "dpctl": [ + "tests/*.*", + "tests/helper/*.py", + "tests/elementwise/*.py", + "tests/*.pyx", + "tests/input_files/*", + "resources/cmake/*.cmake", + "include/*.h*", + "include/syclinterface/*.h*", + "include/syclinterface/Config/*.h", + "include/syclinterface/Support/*.h", + "tensor/libtensor/include/kernels/*.h*", + "tensor/libtensor/include/utils/*.h*", + "tensor/include/dlpack/*.*", + "_sycl*.h", + "memory/_memory*.h", + "program/_program*.h", + "tensor/_usmarray*.h", + "*.pxd", + "memory/*.pxd", + "tensor/*.pxd", + "program/*.pxd", + ] }, - include_package_data=True, + include_package_data=False, zip_safe=False, setup_requires=["Cython"], install_requires=[ From c9cc505ff201f32ed68be45dc1596f5eb75cf2dc Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 17 Oct 2023 01:32:44 -0500 Subject: [PATCH 35/83] Tweaked test_intel_device_info --- dpctl/tests/test_utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/dpctl/tests/test_utils.py b/dpctl/tests/test_utils.py index 1aab7fd7e7..05b2dc7890 100644 --- a/dpctl/tests/test_utils.py +++ b/dpctl/tests/test_utils.py @@ -21,7 +21,6 @@ import dpctl import dpctl.utils -from dpctl.enum_types import backend_type def test_get_execution_queue_input_validation(): @@ -132,9 +131,7 @@ def test_intel_device_info(): pytest.skip("Default device could not be created") descr = dpctl.utils.intel_device_info(d) assert isinstance(descr, dict) - assert ("device_id" in descr) or ( - not d.has_aspect_cpu and not d.backend == backend_type.level_zero - ) + assert ("device_id" in descr) or not descr allowed_names = [ "device_id", "gpu_slices", From 2d2f235bd49aff9ff186811996a794ba52cdbd41 Mon Sep 17 00:00:00 2001 From: ndgrigorian <46709016+ndgrigorian@users.noreply.github.com> Date: Tue, 17 Oct 2023 08:48:13 -0700 Subject: [PATCH 36/83] Elementwise functions cbrt, exp2, copysign, and rsqrt (#1443) * Implements dpctl.tensor.cbrt * Implements copysign and exp2 elementwise funcs * Adds tests for cbrt, copysign, exp2 * Implements rsqrt and tests for rsqrt * Modified tests for cbrt, copysign, and rsqrt Now test more type combinations/output types --- dpctl/tensor/__init__.py | 8 + 
dpctl/tensor/_elementwise_funcs.py | 113 ++++++++ .../kernels/elementwise_functions/cbrt.hpp | 172 ++++++++++++ .../elementwise_functions/copysign.hpp | 215 +++++++++++++++ .../kernels/elementwise_functions/exp2.hpp | 229 +++++++++++++++ .../kernels/elementwise_functions/rsqrt.hpp | 179 ++++++++++++ .../source/elementwise_functions.cpp | 261 +++++++++++++++++- dpctl/tests/elementwise/test_cbrt.py | 79 ++++++ dpctl/tests/elementwise/test_copysign.py | 111 ++++++++ dpctl/tests/elementwise/test_exp2.py | 168 +++++++++++ dpctl/tests/elementwise/test_rsqrt.py | 74 +++++ 11 files changed, 1608 insertions(+), 1 deletion(-) create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp create mode 100644 dpctl/tests/elementwise/test_cbrt.py create mode 100644 dpctl/tests/elementwise/test_copysign.py create mode 100644 dpctl/tests/elementwise/test_exp2.py create mode 100644 dpctl/tests/elementwise/test_rsqrt.py diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index 3473d5cde5..bab31379b7 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -110,13 +110,16 @@ bitwise_or, bitwise_right_shift, bitwise_xor, + cbrt, ceil, conj, + copysign, cos, cosh, divide, equal, exp, + exp2, expm1, floor, floor_divide, @@ -149,6 +152,7 @@ real, remainder, round, + rsqrt, sign, signbit, sin, @@ -314,4 +318,8 @@ "argmax", "argmin", "prod", + "cbrt", + "exp2", + "copysign", + "rsqrt", ] diff --git a/dpctl/tensor/_elementwise_funcs.py b/dpctl/tensor/_elementwise_funcs.py index 259443f8e3..24ae7fa8cf 100644 --- a/dpctl/tensor/_elementwise_funcs.py +++ b/dpctl/tensor/_elementwise_funcs.py @@ -1761,3 +1761,116 @@ hypot = BinaryElementwiseFunc( "hypot", ti._hypot_result_type, ti._hypot, _hypot_docstring_ ) + + +# U37: ==== CBRT (x) +_cbrt_docstring_ = """ +cbrt(x, out=None, order='K') + +Computes positive cube-root for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a real floating-point data type. + out ({None, usm_ndarray}, optional): + Output array to populate. + Array have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the newly output array, if parameter `out` is `None`. + Default: "K". +Returns: + usm_narray: + An array containing the element-wise positive cube-root. + The data type of the returned array is determined by + the Type Promotion Rules. +""" + +cbrt = UnaryElementwiseFunc( + "cbrt", ti._cbrt_result_type, ti._cbrt, _cbrt_docstring_ +) + + +# U38: ==== EXP2 (x) +_exp2_docstring_ = """ +exp2(x, out=None, order='K') + +Computes the base-2 exponential for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a floating-point data type. + out ({None, usm_ndarray}, optional): + Output array to populate. + Array have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the newly output array, if parameter `out` is `None`. + Default: "K". +Returns: + usm_narray: + An array containing the element-wise base-2 exponentials. + The data type of the returned array is determined by + the Type Promotion Rules. 
+""" + +exp2 = UnaryElementwiseFunc( + "exp2", ti._exp2_result_type, ti._exp2, _exp2_docstring_ +) + + +# B25: ==== COPYSIGN (x1, x2) +_copysign_docstring_ = """ +copysign(x1, x2, out=None, order='K') + +Composes a floating-point value with the magnitude of `x1_i` and the sign of +`x2_i` for each element of input arrays `x1` and `x2`. + +Args: + x1 (usm_ndarray): + First input array, expected to have a real floating-point data type. + x2 (usm_ndarray): + Second input array, also expected to have a real floating-point data + type. + out ({None, usm_ndarray}, optional): + Output array to populate. + Array have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the newly output array, if parameter `out` is `None`. + Default: "K". +Returns: + usm_narray: + An array containing the element-wise results. The data type + of the returned array is determined by the Type Promotion Rules. +""" +copysign = BinaryElementwiseFunc( + "copysign", + ti._copysign_result_type, + ti._copysign, + _copysign_docstring_, +) + + +# U39: ==== RSQRT (x) +_rsqrt_docstring_ = """ +rsqrt(x, out=None, order='K') + +Computes the reciprocal square-root for each element `x_i` for input array `x`. + +Args: + x (usm_ndarray): + Input array, expected to have a real floating-point data type. + out ({None, usm_ndarray}, optional): + Output array to populate. + Array have the correct shape and the expected data type. + order ("C","F","A","K", optional): + Memory layout of the newly output array, if parameter `out` is `None`. + Default: "K". +Returns: + usm_narray: + An array containing the element-wise reciprocal square-root. + The data type of the returned array is determined by + the Type Promotion Rules. +""" + +rsqrt = UnaryElementwiseFunc( + "rsqrt", ti._rsqrt_result_type, ti._rsqrt, _rsqrt_docstring_ +) diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp new file mode 100644 index 0000000000..1d4aa65002 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp @@ -0,0 +1,172 @@ +//=== cbrt.hpp - Unary function CBRT ------ *-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of CBRT(x) +/// function that compute a square root. 
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace cbrt +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +template struct CbrtFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::true_type; + + resT operator()(const argT &in) const + { + return sycl::cbrt(in); + } +}; + +template +using CbrtContigFunctor = elementwise_common:: + UnaryContigFunctor, vec_sz, n_vecs>; + +template +using CbrtStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template struct CbrtOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class cbrt_contig_kernel; + +template +sycl::event cbrt_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + return elementwise_common::unary_contig_impl< + argTy, CbrtOutputType, CbrtContigFunctor, cbrt_contig_kernel>( + exec_q, nelems, arg_p, res_p, depends); +} + +template struct CbrtContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = cbrt_contig_impl; + return fn; + } + } +}; + +template struct CbrtTypeMapFactory +{ + /*! 
@brief get typeid for output type of std::cbrt(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename CbrtOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template class cbrt_strided_kernel; + +template +sycl::event +cbrt_strided_impl(sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, CbrtOutputType, CbrtStridedFunctor, cbrt_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template struct CbrtStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = cbrt_strided_impl; + return fn; + } + } +}; + +} // namespace cbrt +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp new file mode 100644 index 0000000000..b1997d06b4 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp @@ -0,0 +1,215 @@ +//=== copysign.hpp - Binary function COPYSIGN ------ *-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of COPYSIGN(x1, x2) +/// function. 
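
`CbrtContigFactory` and `CbrtTypeMapFactory` above follow the dispatch pattern used by every kernel in this series: for each input typeid the factory yields either an implementation pointer or `nullptr` when the output type map resolves to `void`. A schematic Python analogue of that lookup (not dpctl code; the typeid strings are hypothetical):

    import math

    def _cbrt(x):
        # signed cube root, standing in for sycl::cbrt
        return math.copysign(abs(x) ** (1.0 / 3.0), x)

    # one entry per supported input typeid; None plays the role of nullptr
    _cbrt_dispatch = {"f2": _cbrt, "f4": _cbrt, "f8": _cbrt}

    def get_cbrt_impl(typeid):
        # mirrors CbrtContigFactory::get(): unsupported types map to None
        return _cbrt_dispatch.get(typeid)
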
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace copysign +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; +namespace tu_ns = dpctl::tensor::type_utils; + +template struct CopysignFunctor +{ + + using supports_sg_loadstore = std::true_type; + using supports_vec = std::true_type; + + resT operator()(const argT1 &in1, const argT2 &in2) const + { + return sycl::copysign(in1, in2); + } + + template + sycl::vec + operator()(const sycl::vec &in1, + const sycl::vec &in2) const + { + auto tmp = sycl::copysign(in1, in2); + if constexpr (std::is_same_v) { + return tmp; + } + else { + using dpctl::tensor::type_utils::vec_cast; + + return vec_cast( + tmp); + } + } +}; + +template +using CopysignContigFunctor = + elementwise_common::BinaryContigFunctor, + vec_sz, + n_vecs>; + +template +using CopysignStridedFunctor = elementwise_common::BinaryStridedFunctor< + argT1, + argT2, + resT, + IndexerT, + CopysignFunctor>; + +template struct CopysignOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::BinaryTypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class copysign_contig_kernel; + +template +sycl::event copysign_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_contig_impl< + argTy1, argTy2, CopysignOutputType, CopysignContigFunctor, + copysign_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p, + arg2_offset, res_p, res_offset, depends); +} + +template struct CopysignContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename CopysignOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = copysign_contig_impl; + return fn; + } + } +}; + +template struct CopysignTypeMapFactory +{ + /*! 
@brief get typeid for output type of divide(T1 x, T2 y) */ + std::enable_if_t::value, int> get() + { + using rT = typename CopysignOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template +class copysign_strided_kernel; + +template +sycl::event +copysign_strided_impl(sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg1_p, + py::ssize_t arg1_offset, + const char *arg2_p, + py::ssize_t arg2_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_strided_impl< + argTy1, argTy2, CopysignOutputType, CopysignStridedFunctor, + copysign_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p, + arg1_offset, arg2_p, arg2_offset, res_p, + res_offset, depends, additional_depends); +} + +template struct CopysignStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename CopysignOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = copysign_strided_impl; + return fn; + } + } +}; + +} // namespace copysign +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp new file mode 100644 index 0000000000..67ee23df48 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp @@ -0,0 +1,229 @@ +//=== exp2.hpp - Unary function EXP2 ------ +//*-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of EXP2(x) function. 
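
Because `sycl::copysign` transplants only the sign bit, signed zeros and NaNs follow the sign of `x2` as well, which the tests later in this patch rely on. A short sketch of the expected Python-level behavior, assuming a default-selected device:

    import dpctl.tensor as dpt

    x = dpt.asarray([1.5, 0.0, dpt.nan], dtype="f4")
    y = dpt.asarray([-2.0, -0.0, -1.0], dtype="f4")
    r = dpt.copysign(x, y)         # magnitudes of x, signs of y
    bool(dpt.all(dpt.signbit(r)))  # True: every result is negative-signed
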
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include + +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace exp2 +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +using dpctl::tensor::type_utils::is_complex; + +template struct Exp2Functor +{ + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::negation< + std::disjunction, is_complex>>; + + resT operator()(const argT &in) const + { + if constexpr (is_complex::value) { + using realT = typename argT::value_type; + + const argT tmp = in * std::log(realT(2)); + + constexpr realT q_nan = std::numeric_limits::quiet_NaN(); + + const realT x = std::real(tmp); + const realT y = std::imag(tmp); + if (std::isfinite(x)) { + if (std::isfinite(y)) { + return std::exp(tmp); + } + else { + return resT{q_nan, q_nan}; + } + } + else if (std::isnan(x)) { + /* x is nan */ + if (y == realT(0)) { + return resT{in}; + } + else { + return resT{x, q_nan}; + } + } + else { + if (!std::signbit(x)) { /* x is +inf */ + if (y == realT(0)) { + return resT{x, y}; + } + else if (std::isfinite(y)) { + return resT{x * std::cos(y), x * std::sin(y)}; + } + else { + /* x = +inf, y = +-inf || nan */ + return resT{x, q_nan}; + } + } + else { /* x is -inf */ + if (std::isfinite(y)) { + realT exp_x = std::exp(x); + return resT{exp_x * std::cos(y), exp_x * std::sin(y)}; + } + else { + /* x = -inf, y = +-inf || nan */ + return resT{0, 0}; + } + } + } + } + else { + return sycl::exp2(in); + } + } +}; + +template +using Exp2ContigFunctor = elementwise_common:: + UnaryContigFunctor, vec_sz, n_vecs>; + +template +using Exp2StridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template struct Exp2OutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry>, + td_ns::TypeMapResultEntry>, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class exp2_contig_kernel; + +template +sycl::event exp2_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + return elementwise_common::unary_contig_impl< + argTy, Exp2OutputType, Exp2ContigFunctor, exp2_contig_kernel>( + exec_q, nelems, arg_p, res_p, depends); +} + +template struct Exp2ContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = exp2_contig_impl; + return fn; + } + } +}; + +template struct Exp2TypeMapFactory +{ + /*! 
@brief get typeid for output type of std::exp2(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename Exp2OutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template class exp2_strided_kernel; + +template +sycl::event +exp2_strided_impl(sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, Exp2OutputType, Exp2StridedFunctor, exp2_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template struct Exp2StridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = exp2_strided_impl; + return fn; + } + } +}; + +} // namespace exp2 +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp new file mode 100644 index 0000000000..de51b31c30 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp @@ -0,0 +1,179 @@ +//=== rsqrt.hpp - Unary function RSQRT ------ +//*-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===---------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for elementwise evaluation of RSQRT(x) +/// function that computes the reciprocal square root. 
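
For complex input the functor above evaluates 2**z as exp(z * log(2)) and layers the usual special-value handling on top of that identity. The identity itself is easy to spot-check on the host:

    import cmath

    z = 0.5 + 1.0j
    # Exp2Functor computes tmp = in * log(2) and then exp(tmp)
    assert cmath.isclose(2.0 ** z, cmath.exp(z * cmath.log(2)))
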
+//===---------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include "kernels/elementwise_functions/common.hpp" + +#include "utils/offset_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace rsqrt +{ + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +template struct RsqrtFunctor +{ + + // is function constant for given argT + using is_constant = typename std::false_type; + // constant value, if constant + // constexpr resT constant_value = resT{}; + // is function defined for sycl::vec + using supports_vec = typename std::false_type; + // do both argTy and resTy support sugroup store/load operation + using supports_sg_loadstore = typename std::true_type; + + resT operator()(const argT &in) const + { + return sycl::rsqrt(in); + } +}; + +template +using RsqrtContigFunctor = + elementwise_common::UnaryContigFunctor, + vec_sz, + n_vecs>; + +template +using RsqrtStridedFunctor = elementwise_common:: + UnaryStridedFunctor>; + +template struct RsqrtOutputType +{ + using value_type = typename std::disjunction< // disjunction is C++17 + // feature, supported by DPC++ + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::TypeMapResultEntry, + td_ns::DefaultResultEntry>::result_type; +}; + +template +class rsqrt_contig_kernel; + +template +sycl::event rsqrt_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + char *res_p, + const std::vector &depends = {}) +{ + return elementwise_common::unary_contig_impl< + argTy, RsqrtOutputType, RsqrtContigFunctor, rsqrt_contig_kernel>( + exec_q, nelems, arg_p, res_p, depends); +} + +template struct RsqrtContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = rsqrt_contig_impl; + return fn; + } + } +}; + +template struct RsqrtTypeMapFactory +{ + /*! 
@brief get typeid for output type of sycl::rsqrt(T x) */ + std::enable_if_t::value, int> get() + { + using rT = typename RsqrtOutputType::value_type; + return td_ns::GetTypeid{}.get(); + } +}; + +template class rsqrt_strided_kernel; + +template +sycl::event +rsqrt_strided_impl(sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::unary_strided_impl< + argTy, RsqrtOutputType, RsqrtStridedFunctor, rsqrt_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template struct RsqrtStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = rsqrt_strided_impl; + return fn; + } + } +}; + +} // namespace rsqrt +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions.cpp b/dpctl/tensor/libtensor/source/elementwise_functions.cpp index 3cca479a3f..043cac0cd2 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions.cpp @@ -48,12 +48,15 @@ #include "kernels/elementwise_functions/bitwise_or.hpp" #include "kernels/elementwise_functions/bitwise_right_shift.hpp" #include "kernels/elementwise_functions/bitwise_xor.hpp" +#include "kernels/elementwise_functions/cbrt.hpp" #include "kernels/elementwise_functions/ceil.hpp" #include "kernels/elementwise_functions/conj.hpp" +#include "kernels/elementwise_functions/copysign.hpp" #include "kernels/elementwise_functions/cos.hpp" #include "kernels/elementwise_functions/cosh.hpp" #include "kernels/elementwise_functions/equal.hpp" #include "kernels/elementwise_functions/exp.hpp" +#include "kernels/elementwise_functions/exp2.hpp" #include "kernels/elementwise_functions/expm1.hpp" #include "kernels/elementwise_functions/floor.hpp" #include "kernels/elementwise_functions/floor_divide.hpp" @@ -86,6 +89,7 @@ #include "kernels/elementwise_functions/real.hpp" #include "kernels/elementwise_functions/remainder.hpp" #include "kernels/elementwise_functions/round.hpp" +#include "kernels/elementwise_functions/rsqrt.hpp" #include "kernels/elementwise_functions/sign.hpp" #include "kernels/elementwise_functions/signbit.hpp" #include "kernels/elementwise_functions/sin.hpp" @@ -2749,7 +2753,6 @@ void populate_trunc_dispatch_vectors(void) } // namespace impl // B24: ==== HYPOT (x1, x2) - namespace impl { namespace hypot_fn_ns = dpctl::tensor::kernels::hypot; @@ -2788,6 +2791,151 @@ void populate_hypot_dispatch_tables(void) } // namespace impl +// U37: ==== CBRT (x) +namespace impl +{ + +namespace cbrt_fn_ns = dpctl::tensor::kernels::cbrt; + +static unary_contig_impl_fn_ptr_t cbrt_contig_dispatch_vector[td_ns::num_types]; +static int cbrt_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + cbrt_strided_dispatch_vector[td_ns::num_types]; + +void populate_cbrt_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = cbrt_fn_ns; + + using fn_ns::CbrtContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(cbrt_contig_dispatch_vector); + + using fn_ns::CbrtStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(cbrt_strided_dispatch_vector); + + using fn_ns::CbrtTypeMapFactory; + 
DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(cbrt_output_typeid_vector); +} + +} // namespace impl + +// B24: ==== COPYSIGN (x1, x2) +namespace impl +{ +namespace copysign_fn_ns = dpctl::tensor::kernels::copysign; + +static binary_contig_impl_fn_ptr_t + copysign_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int copysign_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + copysign_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_copysign_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = copysign_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::CopysignTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(copysign_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::CopysignStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(copysign_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::CopysignContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(copysign_contig_dispatch_table); +}; + +} // namespace impl + +// U38: ==== EXP2 (x) +namespace impl +{ + +namespace exp2_fn_ns = dpctl::tensor::kernels::exp2; + +static unary_contig_impl_fn_ptr_t exp2_contig_dispatch_vector[td_ns::num_types]; +static int exp2_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + exp2_strided_dispatch_vector[td_ns::num_types]; + +void populate_exp2_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = exp2_fn_ns; + + using fn_ns::Exp2ContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(exp2_contig_dispatch_vector); + + using fn_ns::Exp2StridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(exp2_strided_dispatch_vector); + + using fn_ns::Exp2TypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(exp2_output_typeid_vector); +} + +} // namespace impl + +// U39: ==== RSQRT (x) +namespace impl +{ + +namespace rsqrt_fn_ns = dpctl::tensor::kernels::rsqrt; + +static unary_contig_impl_fn_ptr_t + rsqrt_contig_dispatch_vector[td_ns::num_types]; +static int rsqrt_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + rsqrt_strided_dispatch_vector[td_ns::num_types]; + +void populate_rsqrt_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = rsqrt_fn_ns; + + using fn_ns::RsqrtContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(rsqrt_contig_dispatch_vector); + + using fn_ns::RsqrtStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(rsqrt_strided_dispatch_vector); + + using fn_ns::RsqrtTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(rsqrt_output_typeid_vector); +} + +} // namespace impl + // ========================================================================================== // // @@ -4889,6 +5037,117 @@ void init_elementwise_functions(py::module_ m) py::arg("depends") = py::list()); m.def("_hypot_result_type", hypot_result_type_pyapi, ""); } + + // U37: ==== CBRT (x) + { + impl::populate_cbrt_dispatch_vectors(); + using impl::cbrt_contig_dispatch_vector; + using impl::cbrt_output_typeid_vector; + using impl::cbrt_strided_dispatch_vector; + + auto cbrt_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT 
&depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, cbrt_output_typeid_vector, + cbrt_contig_dispatch_vector, cbrt_strided_dispatch_vector); + }; + m.def("_cbrt", cbrt_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto cbrt_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, cbrt_output_typeid_vector); + }; + m.def("_cbrt_result_type", cbrt_result_type_pyapi); + } + + // B25: ==== COPYSIGN (x1, x2) + { + impl::populate_copysign_dispatch_tables(); + using impl::copysign_contig_dispatch_table; + using impl::copysign_output_id_table; + using impl::copysign_strided_dispatch_table; + + auto copysign_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, + const dpctl::tensor::usm_ndarray &src2, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, + const std::vector &depends = + {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, copysign_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + copysign_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + copysign_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto copysign_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + copysign_output_id_table); + }; + m.def("_copysign", copysign_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_copysign_result_type", copysign_result_type_pyapi, ""); + } + + // U38: ==== EXP2 (x) + { + impl::populate_exp2_dispatch_vectors(); + using impl::exp2_contig_dispatch_vector; + using impl::exp2_output_typeid_vector; + using impl::exp2_strided_dispatch_vector; + + auto exp2_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, exp2_output_typeid_vector, + exp2_contig_dispatch_vector, exp2_strided_dispatch_vector); + }; + m.def("_exp2", exp2_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto exp2_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, exp2_output_typeid_vector); + }; + m.def("_exp2_result_type", exp2_result_type_pyapi); + } + + // U39: ==== RSQRT (x) + { + impl::populate_rsqrt_dispatch_vectors(); + using impl::rsqrt_contig_dispatch_vector; + using impl::rsqrt_output_typeid_vector; + using impl::rsqrt_strided_dispatch_vector; + + auto rsqrt_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, rsqrt_output_typeid_vector, + rsqrt_contig_dispatch_vector, rsqrt_strided_dispatch_vector); + }; + m.def("_rsqrt", rsqrt_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto rsqrt_result_type_pyapi = [&](const py::dtype &dtype) { + return 
py_unary_ufunc_result_type(dtype, + rsqrt_output_typeid_vector); + }; + m.def("_rsqrt_result_type", rsqrt_result_type_pyapi); + } } } // namespace py_internal diff --git a/dpctl/tests/elementwise/test_cbrt.py b/dpctl/tests/elementwise/test_cbrt.py new file mode 100644 index 0000000000..b06a8d19cf --- /dev/null +++ b/dpctl/tests/elementwise/test_cbrt.py @@ -0,0 +1,79 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _map_to_device_dtype, _no_complex_dtypes, _real_fp_dtypes + + +@pytest.mark.parametrize("dtype", _no_complex_dtypes) +def test_cbrt_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + expected_dtype = np.cbrt(np.array(0, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.cbrt(X).dtype == expected_dtype + + +@pytest.mark.parametrize("dtype", _real_fp_dtypes) +def test_cbrt_output_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 1027 + + X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q) + Xnp = dpt.asnumpy(X) + + Y = dpt.cbrt(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.cbrt(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", _real_fp_dtypes) +def test_cbrt_output_strided(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 2054 + + X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2] + Xnp = dpt.asnumpy(X) + + Y = dpt.cbrt(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.cbrt(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.usefixtures("suppress_invalid_numpy_warnings") +def test_cbrt_special_cases(): + get_queue_or_skip() + + X = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4") + res = dpt.cbrt(X) + expected = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4") + tol = dpt.finfo(dpt.float32).resolution + + assert dpt.allclose(res, expected, atol=tol, rtol=tol, equal_nan=True) diff --git a/dpctl/tests/elementwise/test_copysign.py b/dpctl/tests/elementwise/test_copysign.py new file mode 100644 index 0000000000..26a285343c --- /dev/null +++ b/dpctl/tests/elementwise/test_copysign.py @@ -0,0 +1,111 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ctypes + +import numpy as np +import pytest + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _compare_dtypes, _no_complex_dtypes, _real_fp_dtypes + + +@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes) +@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes) +def test_copysign_dtype_matrix(op1_dtype, op2_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(op1_dtype, q) + skip_if_dtype_not_supported(op2_dtype, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=op1_dtype) + ar2 = dpt.ones_like(ar1, dtype=op2_dtype) + + r = dpt.copysign(ar1, ar2) + assert isinstance(r, dpt.usm_ndarray) + expected = np.copysign( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar1.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + assert r.sycl_queue == ar1.sycl_queue + + ar3 = dpt.ones(sz, dtype=op1_dtype) + ar4 = dpt.ones(2 * sz, dtype=op2_dtype) + + r = dpt.copysign(ar3[::-1], ar4[::2]) + assert isinstance(r, dpt.usm_ndarray) + expected = np.copysign( + np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype) + ) + assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q) + assert r.shape == ar3.shape + assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all() + + +@pytest.mark.parametrize("arr_dt", _real_fp_dtypes) +def test_copysign_python_scalar(arr_dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arr_dt, q) + + X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q) + py_ones = ( + bool(1), + int(1), + float(1), + np.float32(1), + ctypes.c_int(1), + ) + for sc in py_ones: + R = dpt.copysign(X, sc) + assert isinstance(R, dpt.usm_ndarray) + R = dpt.copysign(sc, X) + assert isinstance(R, dpt.usm_ndarray) + + +@pytest.mark.parametrize("dt", _real_fp_dtypes) +def test_copysign(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x = dpt.arange(100, dtype=dt, sycl_queue=q) + x[1::2] *= -1 + y = dpt.ones(100, dtype=dt, sycl_queue=q) + y[::2] *= -1 + res = dpt.copysign(x, y) + expected = dpt.negative(x) + tol = dpt.finfo(dt).resolution + assert dpt.allclose(res, expected, atol=tol, rtol=tol) + + +def test_copysign_special_values(): + get_queue_or_skip() + + x1 = dpt.asarray([1.0, 0.0, dpt.nan, dpt.nan], dtype="f4") + y1 = dpt.asarray([-1.0, -0.0, -dpt.nan, -1], dtype="f4") + res = dpt.copysign(x1, y1) + assert dpt.all(dpt.signbit(res)) + x2 = dpt.asarray([-1.0, -0.0, -dpt.nan, -dpt.nan], dtype="f4") + res = dpt.copysign(x2, y1) + assert dpt.all(dpt.signbit(res)) + y2 = dpt.asarray([0.0, 1.0, dpt.nan, 1.0], dtype="f4") + res = dpt.copysign(x2, y2) + assert not dpt.any(dpt.signbit(res)) + res = dpt.copysign(x1, y2) + assert not dpt.any(dpt.signbit(res)) diff --git a/dpctl/tests/elementwise/test_exp2.py b/dpctl/tests/elementwise/test_exp2.py new file mode 100644 index 0000000000..d4bef1efab --- /dev/null +++ b/dpctl/tests/elementwise/test_exp2.py @@ -0,0 +1,168 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under 
the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools + +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _all_dtypes, _map_to_device_dtype, _usm_types + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_exp2_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + X = dpt.asarray(0, dtype=dtype, sycl_queue=q) + expected_dtype = np.exp2(np.array(0, dtype=dtype)).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.exp2(X).dtype == expected_dtype + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_exp2_output_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 1027 + + X = dpt.linspace(1, 5, num=n_seq, dtype=dtype, sycl_queue=q) + Xnp = dpt.asnumpy(X) + + Y = dpt.exp2(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.exp2(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"]) +def test_exp2_output_strided(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 2 * 1027 + + X = dpt.linspace(1, 5, num=n_seq, dtype=dtype, sycl_queue=q)[::-2] + Xnp = dpt.asnumpy(X) + + Y = dpt.exp2(X) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), np.exp2(Xnp), atol=tol, rtol=tol) + + +@pytest.mark.parametrize("usm_type", _usm_types) +def test_exp2_usm_type(usm_type): + q = get_queue_or_skip() + + arg_dt = np.dtype("f4") + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q) + X[..., 0::2] = 1 / 4 + X[..., 1::2] = 1 / 2 + + Y = dpt.exp2(X) + assert Y.usm_type == X.usm_type + assert Y.sycl_queue == X.sycl_queue + assert Y.flags.c_contiguous + + expected_Y = np.empty(input_shape, dtype=arg_dt) + expected_Y[..., 0::2] = np.exp2(np.float32(1 / 4)) + expected_Y[..., 1::2] = np.exp2(np.float32(1 / 2)) + tol = 8 * dpt.finfo(Y.dtype).resolution + + assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", _all_dtypes) +def test_exp2_order(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + arg_dt = np.dtype(dtype) + input_shape = (10, 10, 10, 10) + X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q) + X[..., 0::2] = 1 / 4 + X[..., 1::2] = 1 / 2 + + for ord in ["C", "F", "A", "K"]: + for perms in itertools.permutations(range(4)): + U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms) + Y = dpt.exp2(U, order=ord) + expected_Y = np.exp2(dpt.asnumpy(U)) + tol = 8 * max( + dpt.finfo(Y.dtype).resolution, + np.finfo(expected_Y.dtype).resolution, + ) + assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol) + + +def test_exp2_special_cases(): + get_queue_or_skip() + + X = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4") + res = np.asarray([np.nan, 1.0, 
1.0, np.inf, 0.0], dtype="f4") + + tol = dpt.finfo(X.dtype).resolution + assert_allclose(dpt.asnumpy(dpt.exp2(X)), res, atol=tol, rtol=tol) + + # special cases for complex variant + num_finite = 1.0 + vals = [ + complex(0.0, 0.0), + complex(num_finite, dpt.inf), + complex(num_finite, dpt.nan), + complex(dpt.inf, 0.0), + complex(-dpt.inf, num_finite), + complex(dpt.inf, num_finite), + complex(-dpt.inf, dpt.inf), + complex(dpt.inf, dpt.inf), + complex(-dpt.inf, dpt.nan), + complex(dpt.inf, dpt.nan), + complex(dpt.nan, 0.0), + complex(dpt.nan, num_finite), + complex(dpt.nan, dpt.nan), + ] + X = dpt.asarray(vals, dtype=dpt.complex64) + cis_1 = complex(np.cos(num_finite), np.sin(num_finite)) + c_nan = complex(np.nan, np.nan) + res = np.asarray( + [ + complex(1.0, 0.0), + c_nan, + c_nan, + complex(np.inf, 0.0), + 0.0, + np.inf * cis_1, + complex(0.0, 0.0), + complex(np.inf, np.nan), + complex(0.0, 0.0), + complex(np.inf, np.nan), + complex(np.nan, 0.0), + c_nan, + c_nan, + ], + dtype=np.complex64, + ) + + tol = dpt.finfo(X.dtype).resolution + with np.errstate(invalid="ignore"): + assert_allclose(dpt.asnumpy(dpt.exp2(X)), res, atol=tol, rtol=tol) diff --git a/dpctl/tests/elementwise/test_rsqrt.py b/dpctl/tests/elementwise/test_rsqrt.py new file mode 100644 index 0000000000..ef9378ade2 --- /dev/null +++ b/dpctl/tests/elementwise/test_rsqrt.py @@ -0,0 +1,74 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import pytest +from numpy.testing import assert_allclose + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +from .utils import _map_to_device_dtype, _no_complex_dtypes, _real_fp_dtypes + + +@pytest.mark.parametrize("dtype", _no_complex_dtypes) +def test_rsqrt_out_type(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + x = dpt.asarray(1, dtype=dtype, sycl_queue=q) + expected_dtype = np.reciprocal(np.sqrt(np.array(1, dtype=dtype))).dtype + expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device) + assert dpt.rsqrt(x).dtype == expected_dtype + + +@pytest.mark.parametrize("dtype", _real_fp_dtypes) +def test_rsqrt_output_contig(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 1027 + + x = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q) + res = dpt.rsqrt(x) + expected = np.reciprocal(np.sqrt(dpt.asnumpy(x), dtype=dtype)) + tol = 8 * dpt.finfo(res.dtype).resolution + assert_allclose(dpt.asnumpy(res), expected, atol=tol, rtol=tol) + + +@pytest.mark.parametrize("dtype", _real_fp_dtypes) +def test_rsqrt_output_strided(dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + n_seq = 2054 + + x = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2] + res = dpt.rsqrt(x) + expected = np.reciprocal(np.sqrt(dpt.asnumpy(x), dtype=dtype)) + tol = 8 * dpt.finfo(res.dtype).resolution + assert_allclose(dpt.asnumpy(res), expected, atol=tol, rtol=tol) + + +def test_rsqrt_special_cases(): + get_queue_or_skip() + + x = dpt.asarray([dpt.nan, -1.0, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4") + res = dpt.rsqrt(x) + expected = dpt.asarray( + [dpt.nan, dpt.nan, dpt.inf, -dpt.inf, 0.0, dpt.nan], dtype="f4" + ) + assert dpt.allclose(res, expected, equal_nan=True) From be5fb99b99720311cf8ce431d40770c15f5e100f Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 7 Sep 2023 06:55:16 -0500 Subject: [PATCH 37/83] Added SyclQueue._submit_keep_args_alive method Usage: q = dpctl.SyclQueue() ... e = q.submit(krn, args, ranges) ht_e = q._submit_keep_args_alive(args, [e]) .... ht_e.wait() --- dpctl/_host_task_util.hpp | 47 ++++++++++------ dpctl/_sycl_queue.pxd | 5 ++ dpctl/_sycl_queue.pyx | 113 +++++++++++++++++++++++++++++++++----- 3 files changed, 132 insertions(+), 33 deletions(-) diff --git a/dpctl/_host_task_util.hpp b/dpctl/_host_task_util.hpp index 8db17594fd..ff360eff1c 100644 --- a/dpctl/_host_task_util.hpp +++ b/dpctl/_host_task_util.hpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
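
Spelling out the usage from the commit message: the scheduled host task holds a reference to `args` until the gating events complete, so device code never reads Python buffers that were garbage-collected too early. A degenerate sketch with no gating events, assuming a system with at least one SYCL device:

    import dpctl

    q = dpctl.SyclQueue()
    args = (bytearray(64),)                     # objects a kernel would read
    # typically the second argument is [e] for a submitted kernel's event e
    ht_e = q._submit_keep_args_alive(args, [])
    ht_e.wait()   # the reference count of args drops only after this point
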
@@ -31,28 +31,27 @@ #include "Python.h" #include "syclinterface/dpctl_data_types.h" +#include "syclinterface/dpctl_sycl_type_casters.hpp" #include -int async_dec_ref(DPCTLSyclQueueRef QRef, - PyObject **obj_array, - size_t obj_array_size, - DPCTLSyclEventRef *ERefs, - size_t nERefs) +DPCTLSyclEventRef async_dec_ref(DPCTLSyclQueueRef QRef, + PyObject **obj_array, + size_t obj_array_size, + DPCTLSyclEventRef *depERefs, + size_t nDepERefs, + int *status) { + using dpctl::syclinterface::unwrap; + using dpctl::syclinterface::wrap; - sycl::queue *q = reinterpret_cast(QRef); + sycl::queue *q = unwrap(QRef); - std::vector obj_vec; - obj_vec.reserve(obj_array_size); - for (size_t obj_id = 0; obj_id < obj_array_size; ++obj_id) { - obj_vec.push_back(obj_array[obj_id]); - } + std::vector obj_vec(obj_array, obj_array + obj_array_size); try { - q->submit([&](sycl::handler &cgh) { - for (size_t ev_id = 0; ev_id < nERefs; ++ev_id) { - cgh.depends_on( - *(reinterpret_cast(ERefs[ev_id]))); + sycl::event ht_ev = q->submit([&](sycl::handler &cgh) { + for (size_t ev_id = 0; ev_id < nDepERefs; ++ev_id) { + cgh.depends_on(*(unwrap(depERefs[ev_id]))); } cgh.host_task([obj_array_size, obj_vec]() { // if the main thread has not finilized the interpreter yet @@ -66,9 +65,21 @@ int async_dec_ref(DPCTLSyclQueueRef QRef, } }); }); + + constexpr int result_ok = 0; + + *status = result_ok; + auto e_ptr = new sycl::event(ht_ev); + return wrap(e_ptr); } catch (const std::exception &e) { - return 1; + constexpr int result_std_exception = 1; + + *status = result_std_exception; + return nullptr; } - return 0; + constexpr int result_other_abnormal = 2; + + *status = result_other_abnormal; + return nullptr; } diff --git a/dpctl/_sycl_queue.pxd b/dpctl/_sycl_queue.pxd index 8f9028fabf..0ce33b5ef3 100644 --- a/dpctl/_sycl_queue.pxd +++ b/dpctl/_sycl_queue.pxd @@ -70,6 +70,11 @@ cdef public api class SyclQueue (_SyclQueue) [ cpdef SyclContext get_sycl_context(self) cpdef SyclDevice get_sycl_device(self) cdef DPCTLSyclQueueRef get_queue_ref(self) + cpdef SyclEvent _submit_keep_args_alive( + self, + object args, + list dEvents + ) cpdef SyclEvent submit( self, SyclKernel kernel, diff --git a/dpctl/_sycl_queue.pyx b/dpctl/_sycl_queue.pyx index 361b9d5924..79adf7e014 100644 --- a/dpctl/_sycl_queue.pyx +++ b/dpctl/_sycl_queue.pyx @@ -72,7 +72,7 @@ import logging cdef extern from "_host_task_util.hpp": - int async_dec_ref(DPCTLSyclQueueRef, PyObject **, size_t, DPCTLSyclEventRef *, size_t) nogil + DPCTLSyclEventRef async_dec_ref(DPCTLSyclQueueRef, PyObject **, size_t, DPCTLSyclEventRef *, size_t, int *) nogil __all__ = [ @@ -703,6 +703,79 @@ cdef class SyclQueue(_SyclQueue): """ return self._queue_ref + + cpdef SyclEvent _submit_keep_args_alive( + self, + object args, + list dEvents + ): + """ SyclQueue._submit_keep_args_alive(args, events) + + Keeps objects in `args` alive until tasks associated with events + complete. + + Args: + args(object): Python object to keep alive. + Typically a tuple with arguments to offloaded tasks + events(Tuple[dpctl.SyclEvent]): Gating events + The list or tuple of events associated with tasks + working on Python objects collected in `args`. + Returns: + dpctl.SyclEvent + The event associated with the submission of host task. + + Increments reference count of `args` and schedules asynchronous + ``host_task`` to decrement the count once dependent events are + complete. + + N.B.: The `host_task` attempts to acquire Python GIL, and it is + known to be unsafe during interpreter shudown sequence. 
It is + thus strongly advised to ensure that all submitted `host_task` + complete before the end of the Python script. + """ + cdef size_t nDE = len(dEvents) + cdef DPCTLSyclEventRef *depEvents = NULL + cdef PyObject *args_raw = NULL + cdef DPCTLSyclEventRef htERef = NULL + cdef int status = -1 + + # Create the array of dependent events if any + if nDE > 0: + depEvents = ( + malloc(nDE*sizeof(DPCTLSyclEventRef)) + ) + if not depEvents: + raise MemoryError() + else: + for idx, de in enumerate(dEvents): + if isinstance(de, SyclEvent): + depEvents[idx] = (de).get_event_ref() + else: + free(depEvents) + raise TypeError( + "A sequence of dpctl.SyclEvent is expected" + ) + + # increment reference counts to list of arguments + Py_INCREF(args) + + # schedule decrement + args_raw = args + + htERef = async_dec_ref( + self.get_queue_ref(), + &args_raw, 1, + depEvents, nDE, &status + ) + + free(depEvents) + if (status != 0): + with nogil: DPCTLEvent_Wait(htERef) + raise RuntimeError("Could not submit keep_args_alive host_task") + + return SyclEvent._create(htERef) + + cpdef SyclEvent submit( self, SyclKernel kernel, @@ -715,13 +788,14 @@ cdef class SyclQueue(_SyclQueue): cdef _arg_data_type *kargty = NULL cdef DPCTLSyclEventRef *depEvents = NULL cdef DPCTLSyclEventRef Eref = NULL + cdef DPCTLSyclEventRef htEref = NULL cdef int ret = 0 cdef size_t gRange[3] cdef size_t lRange[3] cdef size_t nGS = len(gS) cdef size_t nLS = len(lS) if lS is not None else 0 cdef size_t nDE = len(dEvents) if dEvents is not None else 0 - cdef PyObject **arg_objects = NULL + cdef PyObject *args_raw = NULL cdef ssize_t i = 0 # Allocate the arrays to be sent to DPCTLQueue_Submit @@ -745,7 +819,15 @@ cdef class SyclQueue(_SyclQueue): raise MemoryError() else: for idx, de in enumerate(dEvents): - depEvents[idx] = (de).get_event_ref() + if isinstance(de, SyclEvent): + depEvents[idx] = (de).get_event_ref() + else: + free(kargs) + free(kargty) + free(depEvents) + raise TypeError( + "A sequence of dpctl.SyclEvent is expected" + ) # populate the args and argstype arrays ret = self._populate_args(args, kargs, kargty) @@ -823,22 +905,23 @@ cdef class SyclQueue(_SyclQueue): raise SyclKernelSubmitError( "Kernel submission to Sycl queue failed." 
) - # increment reference counts to each argument - arg_objects = malloc(len(args) * sizeof(PyObject *)) - for i in range(len(args)): - arg_objects[i] = (args[i]) - Py_INCREF( arg_objects[i]) + # increment reference counts to list of arguments + Py_INCREF(args) # schedule decrement - if async_dec_ref(self.get_queue_ref(), arg_objects, len(args), &Eref, 1): + args_raw = args + + ret = -1 + htERef = async_dec_ref(self.get_queue_ref(), &args_raw, 1, &Eref, 1, &ret) + if ret: # async task submission failed, decrement ref counts and wait - for i in range(len(args)): - arg_objects[i] = (args[i]) - Py_DECREF( arg_objects[i]) - with nogil: DPCTLEvent_Wait(Eref) + Py_DECREF(args) + with nogil: + DPCTLEvent_Wait(Eref) + DPCTLEvent_Wait(htERef) - # free memory - free(arg_objects) + # we are not returning host-task event at the moment + DPCTLEvent_Delete(htERef) return SyclEvent._create(Eref) From 5b5363e95979a79a877e27604be91e28c1554389 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 7 Sep 2023 08:58:03 -0500 Subject: [PATCH 38/83] Used _submit_keep_args_alive in a test --- dpctl/tests/test_sycl_kernel_submit.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dpctl/tests/test_sycl_kernel_submit.py b/dpctl/tests/test_sycl_kernel_submit.py index d15f5c8e2b..31727ce2d8 100644 --- a/dpctl/tests/test_sycl_kernel_submit.py +++ b/dpctl/tests/test_sycl_kernel_submit.py @@ -214,6 +214,9 @@ def test_async_submit(): e3_st = e3.execution_status e2_st = e2.execution_status e1_st = e1.execution_status + ht_e = q._submit_keep_args_alive( + [first_row, second_row, third_row], [e1, e2, e3] + ) are_complete = [ e == status_complete for e in ( @@ -223,6 +226,7 @@ def test_async_submit(): ) ] e3.wait() + ht_e.wait() if not all(are_complete): async_detected = True break From 0a35e9c57e4fea1c5156a76c548fae1f30063e4e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 10 Sep 2023 16:17:18 -0500 Subject: [PATCH 39/83] Do not use async_dec_ref in submit method Instead delegated the task of Python object life-time management to the user via use of _submit_keep_args_alive method --- dpctl/_sycl_queue.pyx | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/dpctl/_sycl_queue.pyx b/dpctl/_sycl_queue.pyx index 79adf7e014..3c2945d82e 100644 --- a/dpctl/_sycl_queue.pyx +++ b/dpctl/_sycl_queue.pyx @@ -905,23 +905,6 @@ cdef class SyclQueue(_SyclQueue): raise SyclKernelSubmitError( "Kernel submission to Sycl queue failed." 
) - # increment reference counts to list of arguments - Py_INCREF(args) - - # schedule decrement - args_raw = args - - ret = -1 - htERef = async_dec_ref(self.get_queue_ref(), &args_raw, 1, &Eref, 1, &ret) - if ret: - # async task submission failed, decrement ref counts and wait - Py_DECREF(args) - with nogil: - DPCTLEvent_Wait(Eref) - DPCTLEvent_Wait(htERef) - - # we are not returning host-task event at the moment - DPCTLEvent_Delete(htERef) return SyclEvent._create(Eref) From f30a6a2efb0189ce373d0393a5e228e6be0baa7f Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 10 Sep 2023 16:18:58 -0500 Subject: [PATCH 40/83] Add memcpy_async method --- dpctl/_sycl_queue.pxd | 1 + dpctl/_sycl_queue.pyx | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/dpctl/_sycl_queue.pxd b/dpctl/_sycl_queue.pxd index 0ce33b5ef3..999fb417af 100644 --- a/dpctl/_sycl_queue.pxd +++ b/dpctl/_sycl_queue.pxd @@ -86,6 +86,7 @@ cdef public api class SyclQueue (_SyclQueue) [ cpdef void wait(self) cdef DPCTLSyclQueueRef get_queue_ref(self) cpdef memcpy(self, dest, src, size_t count) + cpdef SyclEvent memcpy_async(self, dest, src, size_t count) cpdef prefetch(self, ptr, size_t count=*) cpdef mem_advise(self, ptr, size_t count, int mem) cpdef SyclEvent submit_barrier(self, dependent_events=*) diff --git a/dpctl/_sycl_queue.pyx b/dpctl/_sycl_queue.pyx index 3c2945d82e..d944a25e36 100644 --- a/dpctl/_sycl_queue.pyx +++ b/dpctl/_sycl_queue.pyx @@ -934,6 +934,29 @@ cdef class SyclQueue(_SyclQueue): with nogil: DPCTLEvent_Wait(ERef) DPCTLEvent_Delete(ERef) + cpdef SyclEvent memcpy_async(self, dest, src, size_t count): + cdef void *c_dest + cdef void *c_src + cdef DPCTLSyclEventRef ERef = NULL + + if isinstance(dest, _Memory): + c_dest = (<_Memory>dest).memory_ptr + else: + raise TypeError("Parameter `dest` should have type _Memory.") + + if isinstance(src, _Memory): + c_src = (<_Memory>src).memory_ptr + else: + raise TypeError("Parameter `src` should have type _Memory.") + + ERef = DPCTLQueue_Memcpy(self._queue_ref, c_dest, c_src, count) + if (ERef is NULL): + raise RuntimeError( + "SyclQueue.memcpy operation encountered an error" + ) + + return SyclEvent._create(ERef) + cpdef prefetch(self, mem, size_t count=0): cdef void *ptr cdef DPCTLSyclEventRef ERef = NULL From e7ee1d928a4dfb5a305e49af8e152523c9b7573c Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 15 Sep 2023 02:40:59 -0500 Subject: [PATCH 41/83] Introduce dpctl.SyclQueue.submit_async The SyclQueue.submit has become synchronosing, although it still returns a SyclEvent (with exectuion_status always complete) --- dpctl/_sycl_queue.pxd | 8 ++++++++ dpctl/_sycl_queue.pyx | 14 +++++++++++++- dpctl/tests/test_sycl_kernel_submit.py | 8 ++++---- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/dpctl/_sycl_queue.pxd b/dpctl/_sycl_queue.pxd index 999fb417af..35062e0ded 100644 --- a/dpctl/_sycl_queue.pxd +++ b/dpctl/_sycl_queue.pxd @@ -75,6 +75,14 @@ cdef public api class SyclQueue (_SyclQueue) [ object args, list dEvents ) + cpdef SyclEvent submit_async( + self, + SyclKernel kernel, + list args, + list gS, + list lS=*, + list dEvents=* + ) cpdef SyclEvent submit( self, SyclKernel kernel, diff --git a/dpctl/_sycl_queue.pyx b/dpctl/_sycl_queue.pyx index d944a25e36..5fee72bc64 100644 --- a/dpctl/_sycl_queue.pyx +++ b/dpctl/_sycl_queue.pyx @@ -776,7 +776,7 @@ cdef class SyclQueue(_SyclQueue): return SyclEvent._create(htERef) - cpdef SyclEvent submit( + cpdef SyclEvent submit_async( self, SyclKernel kernel, list args, @@ -908,6 
+908,18 @@ cdef class SyclQueue(_SyclQueue): return SyclEvent._create(Eref) + cpdef SyclEvent submit( + self, + SyclKernel kernel, + list args, + list gS, + list lS=None, + list dEvents=None + ): + cdef SyclEvent e = self.submit_async(kernel, args, gS, lS, dEvents) + e.wait() + return e + cpdef void wait(self): with nogil: DPCTLQueue_Wait(self._queue_ref) diff --git a/dpctl/tests/test_sycl_kernel_submit.py b/dpctl/tests/test_sycl_kernel_submit.py index 31727ce2d8..697af32f5c 100644 --- a/dpctl/tests/test_sycl_kernel_submit.py +++ b/dpctl/tests/test_sycl_kernel_submit.py @@ -114,7 +114,7 @@ def test_create_program_from_source(ctype_str, dtype, ctypes_ctor): ) -def test_async_submit(): +def test_submit_async(): try: q = dpctl.SyclQueue("opencl") except dpctl.SyclQueueCreationError: @@ -182,7 +182,7 @@ def test_async_submit(): async_detected = False for attempt in range(5): - e1 = q.submit( + e1 = q.submit_async( kern1Kernel, [ first_row, @@ -192,7 +192,7 @@ def test_async_submit(): n, ], ) - e2 = q.submit( + e2 = q.submit_async( kern2Kernel, [ second_row, @@ -202,7 +202,7 @@ def test_async_submit(): n, ], ) - e3 = q.submit( + e3 = q.submit_async( kern3Kernel, [third_row, first_row, second_row], [ From b6dd4d2c02405d51b533839e0d5a04e75654ae51 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 15 Sep 2023 12:27:02 -0500 Subject: [PATCH 42/83] Use pragma once in _host_task_util.hpp --- dpctl/_host_task_util.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/dpctl/_host_task_util.hpp b/dpctl/_host_task_util.hpp index ff360eff1c..cb3828a54f 100644 --- a/dpctl/_host_task_util.hpp +++ b/dpctl/_host_task_util.hpp @@ -29,6 +29,7 @@ /// //===----------------------------------------------------------------------===// +#pragma once #include "Python.h" #include "syclinterface/dpctl_data_types.h" #include "syclinterface/dpctl_sycl_type_casters.hpp" From 306ff96e1b876f76d6a5e5ce831d3f665c470688 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 15 Sep 2023 16:29:15 -0500 Subject: [PATCH 43/83] Fixed possible memory leak on error --- dpctl/_sycl_queue.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/dpctl/_sycl_queue.pyx b/dpctl/_sycl_queue.pyx index 5fee72bc64..9850da48e4 100644 --- a/dpctl/_sycl_queue.pyx +++ b/dpctl/_sycl_queue.pyx @@ -771,6 +771,7 @@ cdef class SyclQueue(_SyclQueue): free(depEvents) if (status != 0): with nogil: DPCTLEvent_Wait(htERef) + DPCTLEvent_Delete(htERef) raise RuntimeError("Could not submit keep_args_alive host_task") return SyclEvent._create(htERef) From 67a325dfada9d40e180c0f132f9183de2891802d Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 17 Sep 2023 23:06:43 -0500 Subject: [PATCH 44/83] Add DPCTLQueue_MemcpyWithEvents This is the copy operation where one can specify list of events the copy operation requires before start of its execution. DPCTLQueue_MemcpyWithEvents( __dpctl_keep DPCTLSyclQueueRef QRef, void *dst, const void *src, size_t nbytes, const DPCTLSyclEventRef *depEvents, size_t nDE ) Uses this function in tests. 
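
At the Python level, the counterpart added earlier in this series is `SyclQueue.memcpy_async`, which enqueues the copy and returns its `SyclEvent` instead of blocking. A minimal sketch, assuming a device with USM support:

    import dpctl
    import dpctl.memory as dpmem

    q = dpctl.SyclQueue()
    src = dpmem.MemoryUSMDevice(64, queue=q)
    dst = dpmem.MemoryUSMDevice(64, queue=q)
    ev = q.memcpy_async(dst, src, 64)   # returns without waiting
    ev.wait()                           # synchronize when the data is needed

The new C entry point additionally threads a `DepEvents` array into `cgh.depends_on`, so copies can be ordered after prior work without a host-side wait.
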
--- .../include/dpctl_sycl_queue_interface.h | 23 ++++ .../source/dpctl_sycl_queue_interface.cpp | 45 ++++++- .../tests/test_sycl_queue_interface.cpp | 127 +++++++++++------- 3 files changed, 146 insertions(+), 49 deletions(-) diff --git a/libsyclinterface/include/dpctl_sycl_queue_interface.h b/libsyclinterface/include/dpctl_sycl_queue_interface.h index 1c5e53a395..cc466fce17 100644 --- a/libsyclinterface/include/dpctl_sycl_queue_interface.h +++ b/libsyclinterface/include/dpctl_sycl_queue_interface.h @@ -294,6 +294,29 @@ DPCTLQueue_Memcpy(__dpctl_keep const DPCTLSyclQueueRef QRef, const void *Src, size_t Count); +/*! + * @brief C-API wrapper for ``sycl::queue::memcpy``. + * + * @param QRef An opaque pointer to the ``sycl::queue``. + * @param Dest An USM pointer to the destination memory. + * @param Src An USM pointer to the source memory. + * @param Count A number of bytes to copy. + * @param DepEvents A pointer to array of DPCTLSyclEventRef opaque + * pointers to dependent events. + * @param DepEventsCount A number of dependent events. + * @return An opaque pointer to the ``sycl::event`` returned by the + * ``sycl::queue::memcpy`` function. + * @ingroup QueueInterface + */ +DPCTL_API +__dpctl_give DPCTLSyclEventRef +DPCTLQueue_MemcpyWithEvents(__dpctl_keep const DPCTLSyclQueueRef QRef, + void *Dest, + const void *Src, + size_t Count, + __dpctl_keep const DPCTLSyclEventRef *DepEvents, + size_t DepEventsCount); + /*! * @brief C-API wrapper for ``sycl::queue::prefetch``. * diff --git a/libsyclinterface/source/dpctl_sycl_queue_interface.cpp b/libsyclinterface/source/dpctl_sycl_queue_interface.cpp index 4903b888ff..60098ae933 100644 --- a/libsyclinterface/source/dpctl_sycl_queue_interface.cpp +++ b/libsyclinterface/source/dpctl_sycl_queue_interface.cpp @@ -410,9 +410,12 @@ DPCTLQueue_SubmitNDRange(__dpctl_keep const DPCTLSyclKernelRef KRef, try { e = Queue->submit([&](handler &cgh) { // Depend on any event that was specified by the caller. 
- if (NDepEvents) - for (auto i = 0ul; i < NDepEvents; ++i) - cgh.depends_on(*unwrap(DepEvents[i])); + if (DepEvents) + for (auto i = 0ul; i < NDepEvents; ++i) { + auto ei = unwrap(DepEvents[i]); + if (ei) + cgh.depends_on(*ei); + } for (auto i = 0ul; i < NArgs; ++i) { // \todo add support for Sycl buffers @@ -485,6 +488,42 @@ DPCTLQueue_Memcpy(__dpctl_keep const DPCTLSyclQueueRef QRef, } } +__dpctl_give DPCTLSyclEventRef +DPCTLQueue_MemcpyWithEvents(__dpctl_keep const DPCTLSyclQueueRef QRef, + void *Dest, + const void *Src, + size_t Count, + const DPCTLSyclEventRef *DepEvents, + size_t DepEventsCount) +{ + event ev; + auto Q = unwrap(QRef); + if (Q) { + try { + ev = Q->submit([&](handler &cgh) { + if (DepEvents) + for (size_t i = 0; i < DepEventsCount; ++i) { + event *ei = unwrap(DepEvents[i]); + if (ei) + cgh.depends_on(*ei); + } + + cgh.memcpy(Dest, Src, Count); + }); + } catch (const std::exception &ex) { + error_handler(ex, __FILE__, __func__, __LINE__); + return nullptr; + } + } + else { + error_handler("QRef passed to memcpy was NULL.", __FILE__, __func__, + __LINE__); + return nullptr; + } + + return wrap(new event(ev)); +} + __dpctl_give DPCTLSyclEventRef DPCTLQueue_Prefetch(__dpctl_keep DPCTLSyclQueueRef QRef, const void *Ptr, diff --git a/libsyclinterface/tests/test_sycl_queue_interface.cpp b/libsyclinterface/tests/test_sycl_queue_interface.cpp index 8d23929d39..836a87379b 100644 --- a/libsyclinterface/tests/test_sycl_queue_interface.cpp +++ b/libsyclinterface/tests/test_sycl_queue_interface.cpp @@ -340,6 +340,10 @@ TEST(TestDPCTLSyclQueueInterface, CheckMemOpsZeroQRef) ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Memcpy(QRef, p1, p2, n_bytes)); ASSERT_FALSE(bool(ERef)); + ASSERT_NO_FATAL_FAILURE( + ERef = DPCTLQueue_MemcpyWithEvents(QRef, p1, p2, n_bytes, NULL, 0)); + ASSERT_FALSE(bool(ERef)); + ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Prefetch(QRef, p1, n_bytes)); ASSERT_FALSE(bool(ERef)); @@ -391,6 +395,10 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckMemOpsNullPtr) ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Memcpy(QRef, p1, p2, n_bytes)); ASSERT_FALSE(bool(ERef)); + ASSERT_NO_FATAL_FAILURE( + ERef = DPCTLQueue_MemcpyWithEvents(QRef, p1, p2, n_bytes, NULL, 0)); + ASSERT_FALSE(bool(ERef)); + ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Prefetch(QRef, p1, n_bytes)); if (ERef) { ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); @@ -450,6 +458,38 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckMemset) delete[] host_arr; } +TEST_P(TestDPCTLQueueMemberFunctions, CheckMemset2) +{ + DPCTLSyclUSMRef p = nullptr; + DPCTLSyclEventRef Memset_ERef = nullptr; + DPCTLSyclEventRef Memcpy_ERef = nullptr; + uint8_t val = 42; + size_t nbytes = 256; + uint8_t *host_arr = new uint8_t[nbytes]; + + ASSERT_FALSE(host_arr == nullptr); + + ASSERT_NO_FATAL_FAILURE(p = DPCTLmalloc_device(nbytes, QRef)); + ASSERT_FALSE(p == nullptr); + + ASSERT_NO_FATAL_FAILURE( + Memset_ERef = DPCTLQueue_Memset(QRef, (void *)p, val, nbytes)); + + ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents( + QRef, host_arr, p, nbytes, &Memset_ERef, 1)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef)); + + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memset_ERef)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef)); + + ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef)); + + for (size_t i = 0; i < nbytes; ++i) { + ASSERT_TRUE(host_arr[i] == val); + } + delete[] host_arr; +} + TEST(TestDPCTLSyclQueueInterface, CheckFillNullQRef) { DPCTLSyclQueueRef QRef = nullptr; @@ -481,7 +521,8 @@ 
TEST_P(TestDPCTLQueueMemberFunctions, CheckFill8) { using T = uint8_t; DPCTLSyclUSMRef p = nullptr; - DPCTLSyclEventRef ERef = nullptr; + DPCTLSyclEventRef Fill8_ERef = nullptr; + DPCTLSyclEventRef Memcpy_ERef = nullptr; T val = static_cast(0xB); size_t nelems = 256; T *host_arr = new T[nelems]; @@ -492,17 +533,15 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill8) ASSERT_NO_FATAL_FAILURE(p = DPCTLmalloc_device(nbytes, QRef)); ASSERT_FALSE(p == nullptr); - ASSERT_NO_FATAL_FAILURE(ERef = + ASSERT_NO_FATAL_FAILURE(Fill8_ERef = DPCTLQueue_Fill8(QRef, (void *)p, val, nelems)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef)); - ERef = nullptr; + ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents( + QRef, host_arr, p, nbytes, &Fill8_ERef, 1)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef)); - ASSERT_NO_FATAL_FAILURE(ERef = - DPCTLQueue_Memcpy(QRef, host_arr, p, nbytes)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Fill8_ERef)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef)); ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef)); @@ -517,7 +556,8 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill16) using T = uint16_t; DPCTLSyclUSMRef p = nullptr; - DPCTLSyclEventRef ERef = nullptr; + DPCTLSyclEventRef Fill16_ERef = nullptr; + DPCTLSyclEventRef Memcpy_ERef = nullptr; T val = static_cast(0xAB); size_t nelems = 256; T *host_arr = new T[nelems]; @@ -529,16 +569,14 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill16) ASSERT_FALSE(p == nullptr); ASSERT_NO_FATAL_FAILURE( - ERef = DPCTLQueue_Fill16(QRef, (void *)p, val, nelems)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef)); + Fill16_ERef = DPCTLQueue_Fill16(QRef, (void *)p, val, nelems)); - ERef = nullptr; + ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents( + QRef, host_arr, p, nbytes, &Fill16_ERef, 1)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef)); - ASSERT_NO_FATAL_FAILURE(ERef = - DPCTLQueue_Memcpy(QRef, host_arr, p, nbytes)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Fill16_ERef)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef)); ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef)); @@ -553,7 +591,8 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill32) using T = uint32_t; DPCTLSyclUSMRef p = nullptr; - DPCTLSyclEventRef ERef = nullptr; + DPCTLSyclEventRef Fill32_ERef = nullptr; + DPCTLSyclEventRef Memcpy_ERef = nullptr; T val = static_cast(0xABCD); size_t nelems = 256; T *host_arr = new T[nelems]; @@ -565,16 +604,14 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill32) ASSERT_FALSE(p == nullptr); ASSERT_NO_FATAL_FAILURE( - ERef = DPCTLQueue_Fill32(QRef, (void *)p, val, nelems)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef)); + Fill32_ERef = DPCTLQueue_Fill32(QRef, (void *)p, val, nelems)); - ERef = nullptr; + ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents( + QRef, host_arr, p, nbytes, &Fill32_ERef, 1)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef)); - ASSERT_NO_FATAL_FAILURE(ERef = - DPCTLQueue_Memcpy(QRef, host_arr, p, nbytes)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef)); + 
ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Fill32_ERef));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef));
 
     ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef));
 
@@ -589,7 +626,8 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill64)
     using T = uint64_t;
     DPCTLSyclUSMRef p = nullptr;
-    DPCTLSyclEventRef ERef = nullptr;
+    DPCTLSyclEventRef Fill64_ERef = nullptr;
+    DPCTLSyclEventRef Memcpy_ERef = nullptr;
     T val = static_cast<T>(0xABCDEF73);
     size_t nelems = 256;
     T *host_arr = new T[nelems];
@@ -601,16 +639,14 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill64)
     ASSERT_FALSE(p == nullptr);
 
     ASSERT_NO_FATAL_FAILURE(
-        ERef = DPCTLQueue_Fill64(QRef, (void *)p, val, nelems));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef));
+        Fill64_ERef = DPCTLQueue_Fill64(QRef, (void *)p, val, nelems));
 
-    ERef = nullptr;
+    ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents(
+                                QRef, host_arr, p, nbytes, &Fill64_ERef, 1));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef));
 
-    ASSERT_NO_FATAL_FAILURE(ERef =
-                                DPCTLQueue_Memcpy(QRef, host_arr, p, nbytes));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Fill64_ERef));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef));
 
     ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef));
 
@@ -639,7 +675,8 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill128)
     using T = value128_t;
     DPCTLSyclUSMRef p = nullptr;
-    DPCTLSyclEventRef ERef = nullptr;
+    DPCTLSyclEventRef Fill128_ERef = nullptr;
+    DPCTLSyclEventRef Memcpy_ERef = nullptr;
     T val{static_cast<uint64_t>(0xABCDEF73), static_cast<uint64_t>(0x3746AF05)};
     size_t nelems = 256;
     T *host_arr = new T[nelems];
@@ -651,17 +688,15 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill128)
     ASSERT_FALSE(p == nullptr);
 
     ASSERT_NO_FATAL_FAILURE(
-        ERef = DPCTLQueue_Fill128(QRef, (void *)p,
-                                  reinterpret_cast<uint64_t *>(&val), nelems));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef));
+        Fill128_ERef = DPCTLQueue_Fill128(
+            QRef, (void *)p, reinterpret_cast<uint64_t *>(&val), nelems));
 
-    ERef = nullptr;
+    ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents(
+                                QRef, host_arr, p, nbytes, &Fill128_ERef, 1));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef));
 
-    ASSERT_NO_FATAL_FAILURE(ERef =
-                                DPCTLQueue_Memcpy(QRef, host_arr, p, nbytes));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef));
-    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Fill128_ERef));
+    ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef));
 
     ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef));
 

From 6949e690553047600aa767c4c743f474f5608b7d Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Mon, 18 Sep 2023 02:48:44 -0500
Subject: [PATCH 45/83] Adds dpctl.SyclQueue.memcpy_async

Also extends `dpctl.SyclQueue.memcpy` to allow arguments to be objects
that expose the buffer protocol, allowing `dpctl.SyclQueue.memcpy` and
`dpctl.SyclQueue.memcpy_async` to be used to copy between
USM allocations and host buffers.
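A minimal usage sketch of the extended interface; the
`dpctl.memory.MemoryUSMDevice` constructor arguments shown here are
assumptions, and host buffers are passed directly:

```
import dpctl
import dpctl.memory as dpmem

q = dpctl.SyclQueue()
src = b"abcdefghijklmnopqrstuvwxyz"
dst = bytearray(len(src))
usm_buf = dpmem.MemoryUSMDevice(len(src), queue=q)

# host -> USM copy, then a USM -> host copy ordered after it
e1 = q.memcpy_async(usm_buf, src, len(src))
e2 = q.memcpy_async(dst, usm_buf, len(src), [e1])
e2.wait()
assert dst == src
```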
--- dpctl/_backend.pxd | 7 ++ dpctl/_sycl_queue.pxd | 2 +- dpctl/_sycl_queue.pyx | 108 ++++++++++++++++++++------ dpctl/tests/test_sycl_queue_memcpy.py | 76 +++++++++++++++++- 4 files changed, 164 insertions(+), 29 deletions(-) diff --git a/dpctl/_backend.pxd b/dpctl/_backend.pxd index 3f7ba63a55..57da77eb7d 100644 --- a/dpctl/_backend.pxd +++ b/dpctl/_backend.pxd @@ -403,6 +403,13 @@ cdef extern from "syclinterface/dpctl_sycl_queue_interface.h": void *Dest, const void *Src, size_t Count) + cdef DPCTLSyclEventRef DPCTLQueue_MemcpyWithEvents( + const DPCTLSyclQueueRef Q, + void *Dest, + const void *Src, + size_t Count, + const DPCTLSyclEventRef *depEvents, + size_t depEventsCount) cdef DPCTLSyclEventRef DPCTLQueue_Memset( const DPCTLSyclQueueRef Q, void *Dest, diff --git a/dpctl/_sycl_queue.pxd b/dpctl/_sycl_queue.pxd index 35062e0ded..0269cc4aae 100644 --- a/dpctl/_sycl_queue.pxd +++ b/dpctl/_sycl_queue.pxd @@ -94,7 +94,7 @@ cdef public api class SyclQueue (_SyclQueue) [ cpdef void wait(self) cdef DPCTLSyclQueueRef get_queue_ref(self) cpdef memcpy(self, dest, src, size_t count) - cpdef SyclEvent memcpy_async(self, dest, src, size_t count) + cpdef SyclEvent memcpy_async(self, dest, src, size_t count, list dEvents=*) cpdef prefetch(self, ptr, size_t count=*) cpdef mem_advise(self, ptr, size_t count, int mem) cpdef SyclEvent submit_barrier(self, dependent_events=*) diff --git a/dpctl/_sycl_queue.pyx b/dpctl/_sycl_queue.pyx index 9850da48e4..6496b365f6 100644 --- a/dpctl/_sycl_queue.pyx +++ b/dpctl/_sycl_queue.pyx @@ -45,6 +45,7 @@ from ._backend cimport ( # noqa: E211 DPCTLQueue_IsInOrder, DPCTLQueue_MemAdvise, DPCTLQueue_Memcpy, + DPCTLQueue_MemcpyWithEvents, DPCTLQueue_Prefetch, DPCTLQueue_SubmitBarrierForEvents, DPCTLQueue_SubmitNDRange, @@ -64,6 +65,7 @@ import ctypes from .enum_types import backend_type from cpython cimport pycapsule +from cpython.buffer cimport PyObject_CheckBuffer from cpython.ref cimport Py_DECREF, Py_INCREF, PyObject from libc.stdlib cimport free, malloc @@ -160,6 +162,62 @@ cdef void _queue_capsule_deleter(object o) noexcept: DPCTLQueue_Delete(QRef) +cdef bint _is_buffer(object o): + return PyObject_CheckBuffer(o) + + +cdef DPCTLSyclEventRef _memcpy_impl( + SyclQueue q, + object dst, + object src, + size_t byte_count, + DPCTLSyclEventRef *dep_events, + size_t dep_events_count +): + cdef void *c_dst_ptr = NULL + cdef void *c_src_ptr = NULL + cdef DPCTLSyclEventRef ERef = NULL + cdef const unsigned char[::1] src_host_buf = None + cdef unsigned char[::1] dst_host_buf = None + + if isinstance(src, _Memory): + c_src_ptr = (<_Memory>src).memory_ptr + elif _is_buffer(src): + src_host_buf = src + c_src_ptr = &src_host_buf[0] + else: + raise TypeError( + "Parameter `src` should have either type " + "`dpctl.memory._Memory` or a type that " + "supports Python buffer protocol" + ) + + if isinstance(dst, _Memory): + c_dst_ptr = (<_Memory>dst).memory_ptr + elif _is_buffer(dst): + dst_host_buf = dst + c_dst_ptr = &dst_host_buf[0] + else: + raise TypeError( + "Parameter `dst` should have either type " + "`dpctl.memory._Memory` or a type that " + "supports Python buffer protocol" + ) + + if dep_events_count == 0 or dep_events is NULL: + ERef = DPCTLQueue_Memcpy(q._queue_ref, c_dst_ptr, c_src_ptr, byte_count) + else: + ERef = DPCTLQueue_MemcpyWithEvents( + q._queue_ref, + c_dst_ptr, + c_src_ptr, + byte_count, + dep_events, + dep_events_count + ) + return ERef + + cdef class _SyclQueue: """ Barebone data owner class used by SyclQueue. 
""" @@ -925,21 +983,10 @@ cdef class SyclQueue(_SyclQueue): with nogil: DPCTLQueue_Wait(self._queue_ref) cpdef memcpy(self, dest, src, size_t count): - cdef void *c_dest - cdef void *c_src + """Copy memory from `src` to `dst`""" cdef DPCTLSyclEventRef ERef = NULL - if isinstance(dest, _Memory): - c_dest = (<_Memory>dest).memory_ptr - else: - raise TypeError("Parameter `dest` should have type _Memory.") - - if isinstance(src, _Memory): - c_src = (<_Memory>src).memory_ptr - else: - raise TypeError("Parameter `src` should have type _Memory.") - - ERef = DPCTLQueue_Memcpy(self._queue_ref, c_dest, c_src, count) + ERef = _memcpy_impl(self, dest, src, count, NULL, 0) if (ERef is NULL): raise RuntimeError( "SyclQueue.memcpy operation encountered an error" @@ -947,22 +994,33 @@ cdef class SyclQueue(_SyclQueue): with nogil: DPCTLEvent_Wait(ERef) DPCTLEvent_Delete(ERef) - cpdef SyclEvent memcpy_async(self, dest, src, size_t count): - cdef void *c_dest - cdef void *c_src + cpdef SyclEvent memcpy_async(self, dest, src, size_t count, list dEvents=None): + """Copy memory from `src` to `dst`""" cdef DPCTLSyclEventRef ERef = NULL + cdef DPCTLSyclEventRef *depEvents = NULL + cdef size_t nDE = 0 - if isinstance(dest, _Memory): - c_dest = (<_Memory>dest).memory_ptr - else: - raise TypeError("Parameter `dest` should have type _Memory.") - - if isinstance(src, _Memory): - c_src = (<_Memory>src).memory_ptr + if dEvents is None: + ERef = _memcpy_impl(self, dest, src, count, NULL, 0) else: - raise TypeError("Parameter `src` should have type _Memory.") + nDE = len(dEvents) + depEvents = ( + malloc(nDE*sizeof(DPCTLSyclEventRef)) + ) + if depEvents is NULL: + raise MemoryError() + else: + for idx, de in enumerate(dEvents): + if isinstance(de, SyclEvent): + depEvents[idx] = (de).get_event_ref() + else: + free(depEvents) + raise TypeError( + "A sequence of dpctl.SyclEvent is expected" + ) + ERef = _memcpy_impl(self, dest, src, count, depEvents, nDE) + free(depEvents) - ERef = DPCTLQueue_Memcpy(self._queue_ref, c_dest, c_src, count) if (ERef is NULL): raise RuntimeError( "SyclQueue.memcpy operation encountered an error" diff --git a/dpctl/tests/test_sycl_queue_memcpy.py b/dpctl/tests/test_sycl_queue_memcpy.py index 45c8e41f61..bb3c7b0376 100644 --- a/dpctl/tests/test_sycl_queue_memcpy.py +++ b/dpctl/tests/test_sycl_queue_memcpy.py @@ -44,7 +44,77 @@ def test_memcpy_copy_usm_to_usm(): q.memcpy(mobj2, mobj1, 3) - assert mv2[:3], b"123" + assert mv2[:3] == b"123" + + +def test_memcpy_copy_host_to_usm(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Default constructor for SyclQueue failed") + usm_obj = _create_memory(q) + + canary = bytearray(b"123456789") + host_obj = memoryview(canary) + + q.memcpy(usm_obj, host_obj, len(canary)) + + mv2 = memoryview(usm_obj) + + assert mv2[: len(canary)] == canary + + +def test_memcpy_copy_usm_to_host(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Default constructor for SyclQueue failed") + usm_obj = _create_memory(q) + mv2 = memoryview(usm_obj) + + n = 9 + for id in range(n): + mv2[id] = ord("a") + id + + host_obj = bytearray(b" " * n) + + q.memcpy(host_obj, usm_obj, n) + + assert host_obj == b"abcdefghi" + + +def test_memcpy_copy_host_to_host(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Default constructor for SyclQueue failed") + + src_buf = b"abcdefghijklmnopqrstuvwxyz" + dst_buf = bytearray(len(src_buf)) + + q.memcpy(dst_buf, src_buf, len(src_buf)) + + assert 
dst_buf == src_buf + + +def test_memcpy_async(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Default constructor for SyclQueue failed") + + src_buf = b"abcdefghijklmnopqrstuvwxyz" + n = len(src_buf) + dst_buf = bytearray(n) + dst_buf2 = bytearray(n) + + e = q.memcpy_async(dst_buf, src_buf, n) + e2 = q.memcpy_async(dst_buf2, src_buf, n) + + e2.wait() + e.wait() + assert dst_buf == src_buf + assert dst_buf2 == src_buf def test_memcpy_type_error(): @@ -56,8 +126,8 @@ def test_memcpy_type_error(): with pytest.raises(TypeError) as cm: q.memcpy(None, mobj, 3) - assert "`dest`" in str(cm.value) + assert "_Memory" in str(cm.value) with pytest.raises(TypeError) as cm: q.memcpy(mobj, None, 3) - assert "`src`" in str(cm.value) + assert "_Memory" in str(cm.value) From cf9084db3efdeaa67b41af1c6e35529f46a23b3d Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 18 Sep 2023 14:09:46 -0500 Subject: [PATCH 46/83] One of the memcpy_async calls must use events --- dpctl/tests/test_sycl_queue_memcpy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpctl/tests/test_sycl_queue_memcpy.py b/dpctl/tests/test_sycl_queue_memcpy.py index bb3c7b0376..e678b73f03 100644 --- a/dpctl/tests/test_sycl_queue_memcpy.py +++ b/dpctl/tests/test_sycl_queue_memcpy.py @@ -109,10 +109,10 @@ def test_memcpy_async(): dst_buf2 = bytearray(n) e = q.memcpy_async(dst_buf, src_buf, n) - e2 = q.memcpy_async(dst_buf2, src_buf, n) + e2 = q.memcpy_async(dst_buf2, src_buf, n, [e]) - e2.wait() e.wait() + e2.wait() assert dst_buf == src_buf assert dst_buf2 == src_buf From 48cb54d2d2d4d7a30002e164b03d08cf5af18d7f Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 18 Sep 2023 14:39:37 -0500 Subject: [PATCH 47/83] Make SyclTimer accumulative ``` In [9]: timer = dpctl.SyclTimer() In [10]: with timer(q): ...: y = dpt.linspace(1, 2, num=10**6, sycl_queue=q) ...: In [11]: timer.dt Out[11]: (0.0022024469999450957, 0.002116712) In [12]: with timer(q): ...: x = dpt.linspace(0, 1, num=10**6, sycl_queue=q) ...: In [13]: timer.dt Out[13]: (0.004531950999989931, 0.004239664000000001) ``` --- dpctl/_sycl_timer.py | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/dpctl/_sycl_timer.py b/dpctl/_sycl_timer.py index 322272df2d..33c4c2995f 100644 --- a/dpctl/_sycl_timer.py +++ b/dpctl/_sycl_timer.py @@ -67,10 +67,8 @@ def __init__(self, host_timer=timeit.default_timer, time_scale=1): self.timer = host_timer self.time_scale = time_scale self.queue = None - self.host_start = None - self.host_finish = None - self.event_start = None - self.event_finish = None + self.host_times = [] + self.bracketing_events = [] def __call__(self, queue=None): if isinstance(queue, SyclQueue): @@ -89,13 +87,17 @@ def __call__(self, queue=None): return self def __enter__(self): - self.event_start = self.queue.submit_barrier() - self.host_start = self.timer() + self._event_start = self.queue.submit_barrier() + self._host_start = self.timer() return self def __exit__(self, *args): - self.event_finish = self.queue.submit_barrier() - self.host_finish = self.timer() + self.host_times.append((self._host_start, self.timer())) + self.bracketing_events.append( + (self._event_start, self.queue.submit_barrier()) + ) + del self._event_start + del self._host_start @property def dt(self): @@ -103,13 +105,12 @@ def dt(self): element is the duration as measured by the host timer, while the second element is the duration as measured by the device timer and encoded in 
profiling events""" - self.event_start.wait() - self.event_finish.wait() - return ( - (self.host_finish - self.host_start) * self.time_scale, - ( - self.event_finish.profiling_info_start - - self.event_start.profiling_info_end - ) - * (1e-9 * self.time_scale), - ) + for es, ef in self.bracketing_events: + es.wait() + ef.wait() + host_dt = sum(tf - ts for ts, tf in self.host_times) * self.time_scale + dev_dt = sum( + ef.profiling_info_start - es.profiling_info_end + for es, ef in self.bracketing_events + ) * (1e-9 * self.time_scale) + return (host_dt, dev_dt) From 0a737644ea2d180b0fe86a034249802d816e032a Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 21 Sep 2023 19:32:27 -0500 Subject: [PATCH 48/83] Make cdef function except * to channel Python exceptions --- dpctl/_sycl_queue.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/_sycl_queue.pyx b/dpctl/_sycl_queue.pyx index 6496b365f6..a27e6f940f 100644 --- a/dpctl/_sycl_queue.pyx +++ b/dpctl/_sycl_queue.pyx @@ -173,7 +173,7 @@ cdef DPCTLSyclEventRef _memcpy_impl( size_t byte_count, DPCTLSyclEventRef *dep_events, size_t dep_events_count -): +) except *: cdef void *c_dst_ptr = NULL cdef void *c_src_ptr = NULL cdef DPCTLSyclEventRef ERef = NULL From 10722d4f359a9dbb0e07007b28d5b6b47af50ecd Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 5 Oct 2023 15:51:59 -0500 Subject: [PATCH 49/83] SyclTimer.dt return object with named accessors The object can unpack into a tuple, like before, but it prints with annotation of what each number means, and provides names getters. with timer(q): code dur = timer.dt print(dur) # outputs (host_dt=..., device_dt=...) dur.host_dt # get host-timer delta dur.device_dt # get device-timer delta hdt, ddt = dur # unpack into a tuple --- dpctl/_sycl_timer.py | 37 +++++++++++++++++++++++++++------- dpctl/tests/test_sycl_event.py | 7 ++++++- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/dpctl/_sycl_timer.py b/dpctl/_sycl_timer.py index 33c4c2995f..66dd4f9340 100644 --- a/dpctl/_sycl_timer.py +++ b/dpctl/_sycl_timer.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - import timeit from . import SyclQueue @@ -22,6 +21,29 @@ __doc__ = "This module implements :class:`dpctl.SyclTimer`." +class HostDeviceDuration: + def __init__(self, host_dt, device_dt): + self._host_dt = host_dt + self._device_dt = device_dt + + def __repr__(self): + return f"(host_dt={self._host_dt}, device_dt={self._device_dt})" + + def __str__(self): + return f"(host_dt={self._host_dt}, device_dt={self._device_dt})" + + def __iter__(self): + yield from [self._host_dt, self._device_dt] + + @property + def host_dt(self): + return self._host_dt + + @property + def device_dt(self): + return self._device_dt + + class SyclTimer: """ SyclTimer(host_timer=timeit.default_timer, time_scale=1) @@ -45,7 +67,7 @@ class SyclTimer: code_block # retrieve elapsed times in milliseconds - sycl_dt, wall_dt = timer.dt + wall_dt, device_dt = timer.dt Remark: The timer submits barriers to the queue at the entrance and the @@ -101,10 +123,11 @@ def __exit__(self, *args): @property def dt(self): - """Returns a tuple of elapsed times where first - element is the duration as measured by the host timer, - while the second element is the duration as measured by - the device timer and encoded in profiling events""" + """Returns a pair of elapsed times (host_dt, device_dt). 
+ + The host_dt is the duration as measured by the host + timer, while the device_dt is the duration as measured by + the device timer and encoded in profiling events.""" for es, ef in self.bracketing_events: es.wait() ef.wait() @@ -113,4 +136,4 @@ def dt(self): ef.profiling_info_start - es.profiling_info_end for es, ef in self.bracketing_events ) * (1e-9 * self.time_scale) - return (host_dt, dev_dt) + return HostDeviceDuration(host_dt, dev_dt) diff --git a/dpctl/tests/test_sycl_event.py b/dpctl/tests/test_sycl_event.py index fa496d1bb8..7f0db07539 100644 --- a/dpctl/tests/test_sycl_event.py +++ b/dpctl/tests/test_sycl_event.py @@ -202,7 +202,12 @@ def test_sycl_timer(): m1.copy_from_device(m2) # host operation [x**2 for x in range(128 * 1024)] - host_dt, device_dt = timer.dt + elapsed = timer.dt + host_dt, device_dt = elapsed + assert isinstance(repr(elapsed), str) + assert isinstance(str(elapsed), str) + assert host_dt == elapsed.host_dt + assert device_dt == elapsed.device_dt assert host_dt > device_dt or (host_dt > 0 and device_dt >= 0) q_no_profiling = dpctl.SyclQueue() assert q_no_profiling.has_enable_profiling is False From db331d4d2a4a062e78bb33f146fb710effbae7e7 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 22 Oct 2023 04:21:19 -0500 Subject: [PATCH 50/83] Removed superfluous barrier, moved comment, formatted kernel --- .../include/kernels/accumulators.hpp | 76 ++++++++++--------- 1 file changed, 39 insertions(+), 37 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index 110010706c..a8ef1c423e 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -125,54 +125,56 @@ sycl::event inclusive_scan_rec(sycl::queue &exec_q, auto lws = sycl::range<1>(wg_size); auto gws = sycl::range<1>(n_groups * wg_size); + auto ndRange = sycl::nd_range<1>(gws, lws); + slmT slm_iscan_tmp(lws, cgh); - cgh.parallel_for>( - sycl::nd_range<1>(gws, lws), [=, slm_iscan_tmp = std::move(slm_iscan_tmp)](sycl::nd_item<1> it) - { - auto chunk_gid = it.get_global_id(0); - auto lid = it.get_local_id(0); + using KernelName = inclusive_scan_rec_local_scan_krn< + inputT, outputT, n_wi, IndexerT, decltype(transformer)>; + + cgh.parallel_for(ndRange, [=, slm_iscan_tmp = std::move( + slm_iscan_tmp)]( + sycl::nd_item<1> it) { + auto chunk_gid = it.get_global_id(0); + auto lid = it.get_local_id(0); - std::array local_isum; + std::array local_isum; - size_t i = chunk_gid * n_wi; - for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { - constexpr outputT out_zero(0); + size_t i = chunk_gid * n_wi; + for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) { + constexpr outputT out_zero(0); - local_isum[m_wi] = - (i + m_wi < n_elems) - ? transformer(input[indexer(s0 + s1 * (i + m_wi))]) - : out_zero; - } + local_isum[m_wi] = + (i + m_wi < n_elems) + ? 
transformer(input[indexer(s0 + s1 * (i + m_wi))])
+                    : out_zero;
+            }
 
-// local_isum is now result of
-// inclusive scan of locally stored mask indicators
 #pragma unroll
-            for (size_t m_wi = 1; m_wi < n_wi; ++m_wi) {
-                local_isum[m_wi] += local_isum[m_wi - 1];
-            }
+            for (size_t m_wi = 1; m_wi < n_wi; ++m_wi) {
+                local_isum[m_wi] += local_isum[m_wi - 1];
+            }
+            // local_isum is now result of
+            // inclusive scan of locally stored inputs
 
-            size_t wg_iscan_val =
-                sycl::inclusive_scan_over_group(it.get_group(),
-                                                local_isum.back(),
-                                                sycl::plus<size_t>(),
-                                                size_t(0));
+            size_t wg_iscan_val = sycl::inclusive_scan_over_group(
+                it.get_group(), local_isum.back(), sycl::plus<size_t>(),
+                size_t(0));
 
-            slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val;
-            it.barrier(sycl::access::fence_space::local_space);
-            size_t addand = (lid == 0) ? 0 : slm_iscan_tmp[lid];
-            it.barrier(sycl::access::fence_space::local_space);
+            slm_iscan_tmp[(lid + 1) % wg_size] = wg_iscan_val;
+            it.barrier(sycl::access::fence_space::local_space);
+            size_t addand = (lid == 0) ? 0 : slm_iscan_tmp[lid];
 
 #pragma unroll
-            for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) {
-                local_isum[m_wi] += addand;
-            }
-
-            for (size_t m_wi = 0; m_wi < n_wi && i + m_wi < n_elems; ++m_wi) {
-                output[i + m_wi] = local_isum[m_wi];
-            }
-        });
+            for (size_t m_wi = 0; m_wi < n_wi; ++m_wi) {
+                local_isum[m_wi] += addand;
+            }
+
+            for (size_t m_wi = 0; m_wi < n_wi && i + m_wi < n_elems; ++m_wi)
+            {
+                output[i + m_wi] = local_isum[m_wi];
+            }
+        });
     });
 
     sycl::event out_event = inc_scan_phase1_ev;

From 4ac53fd0725d683e8b4935e2609ed20c0b5b55a1 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Sun, 22 Oct 2023 20:24:41 -0500
Subject: [PATCH 51/83] Use partition_type_property descriptor in
 DPCTLDevice_GetParentDevice

This allows testing that a device is an unpartitioned device without
raising and handling a C++ exception. If
info::device::partition_type_property is
info::partition_property::no_partition, then retrieving the
parent_device descriptor would throw.
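With this change, callers can use a NULL return to detect a root device
without an exception being thrown internally. A minimal sketch, assuming
`DRef` is a valid `DPCTLSyclDeviceRef`:

```
DPCTLSyclDeviceRef PRef = DPCTLDevice_GetParentDevice(DRef);
if (PRef) {
    /* DRef is a sub-device; the caller owns PRef and must delete it */
    DPCTLDevice_Delete(PRef);
}
else {
    /* DRef is an unpartitioned (root) device and has no parent */
}
```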
---
 .../source/dpctl_sycl_device_interface.cpp | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/libsyclinterface/source/dpctl_sycl_device_interface.cpp b/libsyclinterface/source/dpctl_sycl_device_interface.cpp
index b5a97013c2..7a159a331c 100644
--- a/libsyclinterface/source/dpctl_sycl_device_interface.cpp
+++ b/libsyclinterface/source/dpctl_sycl_device_interface.cpp
@@ -543,6 +543,18 @@ DPCTLDevice_GetParentDevice(__dpctl_keep const DPCTLSyclDeviceRef DRef)
 {
     auto D = unwrap(DRef);
     if (D) {
+        bool is_unpartitioned = false;
+        try {
+            auto pp =
+                D->get_info<sycl::info::device::partition_type_property>();
+            is_unpartitioned =
+                (pp == sycl::info::partition_property::no_partition);
+        } catch (std::exception const &e) {
+            error_handler(e, __FILE__, __func__, __LINE__);
+            return nullptr;
+        }
+        if (is_unpartitioned)
+            return nullptr;
         try {
             const auto &parent_D =
                 D->get_info<sycl::info::device::parent_device>();
             return wrap(new device(parent_D));

From 442e46f4d16529d7d188ab9264deaf2611598e16 Mon Sep 17 00:00:00 2001
From: ndgrigorian <46709016+ndgrigorian@users.noreply.github.com>
Date: Wed, 25 Oct 2023 09:42:02 -0700
Subject: [PATCH 52/83] Implement kernels for in-place ``pow``, ``remainder``,
 and bitwise operators (#1447)

* Implements dedicated __ipow__ kernel

* Implements in-place remainder

* Implements in-place bitwise_and and bitwise_or

* Implements in-place bitwise_xor

* Implements in-place bitwise_left_shift and bitwise_right_shift

* Adds tests for in-place bitwise elementwise funcs

* Added tests for in-place remainder and pow

  Fixed in-place remainder for devices that do not support 64-bit
  floating point data types

* Test commit splitting up elementwise functions

* Added missing includes of common_inplace

* Split elementwise functions into two more files and added them to the build

* Fix more missing includes

* Splits elementwise functions into separate source files

* Corrected numbers of elementwise functions

* Added missing vector include to elementwise function source files

  Removed utility include

* Remove variable name in function declaration

* No need to import init functions into namespace, since they are defined in it

  Removed `using dpctl::tensor::py_internal::init_abs`, since this
  imports `init_abs` into the current namespace from
  `dpctl::tensor::py_internal`, but this namespace is the current
  namespace and so the import is a no-op. Also added a brief docstring
  for the common init module.

* Changed use of "static inline" for utility functions

  Instead, moved common functions into an anonymous namespace as inline,
  which is the C++ way of expressing that multiple definitions of the
  same function may exist in different C++ translation units, which the
  linker unifies.

* Moved inline functions into separate translation units

  Instead of using the inline keyword to allow multiple definitions of
  the same function in different translation units, introduced
  elementwise_functions_type_utils.cpp that defines these functions and
  a header file to use in other translation units. This should reduce
  the binary size of the produced object files and simplify the
  linker's job, reducing the link time.
* Added license header for 2 new files --------- Co-authored-by: Oleksandr Pavlyk --- dpctl/tensor/CMakeLists.txt | 83 +- dpctl/tensor/_elementwise_funcs.py | 29 +- .../elementwise_functions/bitwise_and.hpp | 139 + .../elementwise_functions/bitwise_invert.hpp | 2 + .../bitwise_left_shift.hpp | 145 + .../elementwise_functions/bitwise_or.hpp | 139 + .../bitwise_right_shift.hpp | 147 + .../elementwise_functions/bitwise_xor.hpp | 139 + .../elementwise_functions/floor_divide.hpp | 1 + .../kernels/elementwise_functions/pow.hpp | 252 +- .../elementwise_functions/remainder.hpp | 189 + .../elementwise_functions/subtract.hpp | 1 + .../elementwise_functions/true_divide.hpp | 1 + .../source/elementwise_functions.cpp | 5155 ----------------- .../source/elementwise_functions/abs.cpp | 119 + .../source/elementwise_functions/abs.hpp | 42 + .../source/elementwise_functions/acos.cpp | 119 + .../source/elementwise_functions/acos.hpp | 42 + .../source/elementwise_functions/acosh.cpp | 121 + .../source/elementwise_functions/acosh.hpp | 42 + .../source/elementwise_functions/add.cpp | 229 + .../source/elementwise_functions/add.hpp | 42 + .../source/elementwise_functions/asin.cpp | 119 + .../source/elementwise_functions/asin.hpp | 42 + .../source/elementwise_functions/asinh.cpp | 121 + .../source/elementwise_functions/asinh.hpp | 42 + .../source/elementwise_functions/atan.cpp | 119 + .../source/elementwise_functions/atan.hpp | 42 + .../source/elementwise_functions/atan2.cpp | 140 + .../source/elementwise_functions/atan2.hpp | 42 + .../source/elementwise_functions/atanh.cpp | 121 + .../source/elementwise_functions/atanh.hpp | 42 + .../elementwise_functions/bitwise_and.cpp | 190 + .../elementwise_functions/bitwise_and.hpp | 42 + .../elementwise_functions/bitwise_invert.cpp | 123 + .../elementwise_functions/bitwise_invert.hpp | 42 + .../bitwise_left_shift.cpp | 200 + .../bitwise_left_shift.hpp | 42 + .../elementwise_functions/bitwise_or.cpp | 190 + .../elementwise_functions/bitwise_or.hpp | 42 + .../bitwise_right_shift.cpp | 201 + .../bitwise_right_shift.hpp | 42 + .../elementwise_functions/bitwise_xor.cpp | 190 + .../elementwise_functions/bitwise_xor.hpp | 42 + .../source/elementwise_functions/cbrt.cpp | 119 + .../source/elementwise_functions/cbrt.hpp | 42 + .../source/elementwise_functions/ceil.cpp | 119 + .../source/elementwise_functions/ceil.hpp | 44 + .../source/elementwise_functions/conj.cpp | 119 + .../source/elementwise_functions/conj.hpp | 42 + .../source/elementwise_functions/copysign.cpp | 140 + .../source/elementwise_functions/copysign.hpp | 42 + .../source/elementwise_functions/cos.cpp | 119 + .../source/elementwise_functions/cos.hpp | 42 + .../source/elementwise_functions/cosh.cpp | 119 + .../source/elementwise_functions/cosh.hpp | 42 + .../elementwise_common.cpp | 181 + .../elementwise_common.hpp | 42 + .../elementwise_functions.hpp | 26 +- .../elementwise_functions_type_utils.cpp | 95 + .../elementwise_functions_type_utils.hpp | 56 + .../source/elementwise_functions/equal.cpp | 140 + .../source/elementwise_functions/equal.hpp | 42 + .../source/elementwise_functions/exp.cpp | 119 + .../source/elementwise_functions/exp.hpp | 42 + .../source/elementwise_functions/exp2.cpp | 119 + .../source/elementwise_functions/exp2.hpp | 42 + .../source/elementwise_functions/expm1.cpp | 121 + .../source/elementwise_functions/expm1.hpp | 42 + .../source/elementwise_functions/floor.cpp | 121 + .../source/elementwise_functions/floor.hpp | 42 + .../elementwise_functions/floor_divide.cpp | 190 + 
.../elementwise_functions/floor_divide.hpp | 42 + .../source/elementwise_functions/greater.cpp | 140 + .../source/elementwise_functions/greater.hpp | 42 + .../elementwise_functions/greater_equal.cpp | 141 + .../elementwise_functions/greater_equal.hpp | 42 + .../source/elementwise_functions/hypot.cpp | 140 + .../source/elementwise_functions/hypot.hpp | 42 + .../source/elementwise_functions/imag.cpp | 119 + .../source/elementwise_functions/imag.hpp | 42 + .../source/elementwise_functions/isfinite.cpp | 122 + .../source/elementwise_functions/isfinite.hpp | 42 + .../source/elementwise_functions/isinf.cpp | 121 + .../source/elementwise_functions/isinf.hpp | 42 + .../source/elementwise_functions/isnan.cpp | 121 + .../source/elementwise_functions/isnan.hpp | 42 + .../source/elementwise_functions/less.cpp | 140 + .../source/elementwise_functions/less.hpp | 42 + .../elementwise_functions/less_equal.cpp | 140 + .../elementwise_functions/less_equal.hpp | 42 + .../source/elementwise_functions/log.cpp | 119 + .../source/elementwise_functions/log.hpp | 42 + .../source/elementwise_functions/log10.cpp | 121 + .../source/elementwise_functions/log10.hpp | 42 + .../source/elementwise_functions/log1p.cpp | 121 + .../source/elementwise_functions/log1p.hpp | 42 + .../source/elementwise_functions/log2.cpp | 119 + .../source/elementwise_functions/log2.hpp | 42 + .../elementwise_functions/logaddexp.cpp | 140 + .../elementwise_functions/logaddexp.hpp | 42 + .../elementwise_functions/logical_and.cpp | 140 + .../elementwise_functions/logical_and.hpp | 42 + .../elementwise_functions/logical_not.cpp | 123 + .../elementwise_functions/logical_not.hpp | 42 + .../elementwise_functions/logical_or.cpp | 140 + .../elementwise_functions/logical_or.hpp | 42 + .../elementwise_functions/logical_xor.cpp | 140 + .../elementwise_functions/logical_xor.hpp | 42 + .../source/elementwise_functions/maximum.cpp | 140 + .../source/elementwise_functions/maximum.hpp | 42 + .../source/elementwise_functions/minimum.cpp | 140 + .../source/elementwise_functions/minimum.hpp | 42 + .../source/elementwise_functions/multiply.cpp | 230 + .../source/elementwise_functions/multiply.hpp | 42 + .../source/elementwise_functions/negative.cpp | 122 + .../source/elementwise_functions/negative.hpp | 42 + .../elementwise_functions/not_equal.cpp | 140 + .../elementwise_functions/not_equal.hpp | 42 + .../source/elementwise_functions/positive.cpp | 122 + .../source/elementwise_functions/positive.hpp | 42 + .../source/elementwise_functions/pow.cpp | 189 + .../source/elementwise_functions/pow.hpp | 42 + .../source/elementwise_functions/proj.cpp | 119 + .../source/elementwise_functions/proj.hpp | 42 + .../source/elementwise_functions/real.cpp | 119 + .../source/elementwise_functions/real.hpp | 42 + .../elementwise_functions/remainder.cpp | 190 + .../elementwise_functions/remainder.hpp | 42 + .../source/elementwise_functions/round.cpp | 121 + .../source/elementwise_functions/round.hpp | 42 + .../source/elementwise_functions/rsqrt.cpp | 121 + .../source/elementwise_functions/rsqrt.hpp | 42 + .../source/elementwise_functions/sign.cpp | 119 + .../source/elementwise_functions/sign.hpp | 42 + .../source/elementwise_functions/signbit.cpp | 122 + .../source/elementwise_functions/signbit.hpp | 42 + .../source/elementwise_functions/sin.cpp | 119 + .../source/elementwise_functions/sin.hpp | 42 + .../source/elementwise_functions/sinh.cpp | 119 + .../source/elementwise_functions/sinh.hpp | 42 + .../source/elementwise_functions/sqrt.cpp | 119 + 
.../source/elementwise_functions/sqrt.hpp | 42 + .../source/elementwise_functions/square.cpp | 121 + .../source/elementwise_functions/square.hpp | 42 + .../source/elementwise_functions/subtract.cpp | 229 + .../source/elementwise_functions/subtract.hpp | 42 + .../source/elementwise_functions/tan.cpp | 119 + .../source/elementwise_functions/tan.hpp | 42 + .../source/elementwise_functions/tanh.cpp | 119 + .../source/elementwise_functions/tanh.hpp | 42 + .../elementwise_functions/true_divide.cpp | 241 + .../elementwise_functions/true_divide.hpp | 42 + .../source/elementwise_functions/trunc.cpp | 121 + .../source/elementwise_functions/trunc.hpp | 42 + dpctl/tensor/libtensor/source/tensor_py.cpp | 2 +- dpctl/tests/elementwise/test_bitwise_and.py | 56 + .../elementwise/test_bitwise_left_shift.py | 52 + dpctl/tests/elementwise/test_bitwise_or.py | 56 + .../elementwise/test_bitwise_right_shift.py | 52 + dpctl/tests/elementwise/test_bitwise_xor.py | 56 + dpctl/tests/elementwise/test_pow.py | 58 + dpctl/tests/elementwise/test_remainder.py | 52 + 163 files changed, 14358 insertions(+), 5201 deletions(-) delete mode 100644 dpctl/tensor/libtensor/source/elementwise_functions.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/abs.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/abs.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/acos.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/acos.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/acosh.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/acosh.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/add.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/add.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/asin.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/asin.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/asinh.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/asinh.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/atan.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/atan.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/atan2.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/atan2.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/atanh.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/atanh.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp create mode 100644 
dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/cbrt.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/cbrt.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/ceil.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/ceil.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/conj.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/conj.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/copysign.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/copysign.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/cos.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/cos.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/cosh.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/cosh.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp rename dpctl/tensor/libtensor/source/{ => elementwise_functions}/elementwise_functions.hpp (97%) create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/equal.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/equal.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/exp.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/exp.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/exp2.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/exp2.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/expm1.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/expm1.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/floor.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/floor.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/greater.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/greater.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/hypot.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/hypot.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/imag.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/imag.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/isfinite.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/isfinite.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/isinf.cpp create mode 100644 
dpctl/tensor/libtensor/source/elementwise_functions/isinf.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/isnan.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/isnan.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/less.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/less.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/less_equal.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/less_equal.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/log.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/log.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/log10.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/log10.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/log1p.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/log1p.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/log2.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/log2.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/logical_and.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/logical_and.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/logical_not.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/logical_not.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/logical_or.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/logical_or.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/maximum.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/maximum.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/minimum.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/minimum.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/multiply.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/multiply.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/negative.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/negative.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/not_equal.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/not_equal.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/positive.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/positive.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/pow.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/pow.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/proj.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/proj.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/real.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/real.hpp create 
mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/remainder.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/remainder.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/round.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/round.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/sign.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/sign.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/signbit.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/signbit.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/sin.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/sin.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/sinh.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/sinh.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/sqrt.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/sqrt.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/square.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/square.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/subtract.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/subtract.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/tan.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/tan.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/tanh.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/tanh.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/true_divide.hpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/trunc.cpp create mode 100644 dpctl/tensor/libtensor/source/elementwise_functions/trunc.hpp diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index aba009411d..35ca62198f 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -30,6 +30,78 @@ if(WIN32) endif() endif() +set(_elementwise_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/elementwise_common.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/abs.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/acos.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/acosh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/add.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/asin.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/asinh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atan2.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/atanh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_and.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_invert.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_left_shift.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_or.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_right_shift.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/bitwise_xor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cbrt.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/ceil.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/conj.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/copysign.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cos.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/cosh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/equal.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/exp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/exp2.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/expm1.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/floor_divide.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/floor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/greater_equal.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/greater.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/hypot.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/imag.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isfinite.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isinf.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/isnan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/less_equal.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/less.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log1p.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log2.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/log10.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logaddexp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_and.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_not.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_or.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/logical_xor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/maximum.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/minimum.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/multiply.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/negative.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/not_equal.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/positive.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/pow.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/proj.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/real.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/remainder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/round.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/rsqrt.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sign.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/signbit.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sin.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sinh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sqrt.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/square.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/subtract.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/tan.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/tanh.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/true_divide.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/trunc.cpp +) set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_py.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp @@ -47,10 +119,12 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_reductions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reduction_over_axis.cpp ) +list(APPEND _tensor_impl_sources + ${_elementwise_sources} +) set(python_module_name _tensor_impl) pybind11_add_module(${python_module_name} MODULE ${_tensor_impl_sources}) @@ -63,9 +137,11 @@ endif() set(_no_fast_math_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reduction_over_axis.cpp ) +list(APPEND _no_fast_math_sources + ${_elementwise_sources} +) foreach(_src_fn ${_no_fast_math_sources}) get_source_file_property(_cmpl_options_prop ${_src_fn} COMPILE_OPTIONS) set(_combined_options_prop ${_cmpl_options_prop} "${_clang_prefix}-fno-fast-math") @@ -76,7 +152,8 @@ foreach(_src_fn ${_no_fast_math_sources}) endforeach() if (UNIX) set_source_files_properties( - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/abs.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sqrt.cpp PROPERTIES COMPILE_DEFINITIONS "USE_STD_ABS_FOR_COMPLEX_TYPES;USE_STD_SQRT_FOR_COMPLEX_TYPES") endif() target_compile_options(${python_module_name} PRIVATE -fno-sycl-id-queries-fit-in-int) diff --git a/dpctl/tensor/_elementwise_funcs.py b/dpctl/tensor/_elementwise_funcs.py index 24ae7fa8cf..aa5ba04b19 100644 --- a/dpctl/tensor/_elementwise_funcs.py +++ b/dpctl/tensor/_elementwise_funcs.py @@ -297,6 +297,7 @@ ti._bitwise_and_result_type, ti._bitwise_and, _bitwise_and_docstring_, + binary_inplace_fn=ti._bitwise_and_inplace, ) # B04: ===== BITWISE_LEFT_SHIFT (x1, x2) @@ -330,6 +331,7 @@ ti._bitwise_left_shift_result_type, ti._bitwise_left_shift, 
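The binary_inplace_fn= arguments added in these hunks register dedicated in-place kernels with BinaryElementwiseFunc, so that an augmented assignment such as x &= y or x <<= y can update the destination array in place rather than materializing a temporary result. Below is a minimal standard-C++ sketch of the traversal such a kernel performs on contiguous data; the names (inplace_contig_loop, BitAndInplace) are illustrative, not dpctl's, and the real kernels additionally handle offsets, strides, and SYCL work distribution.

    // Hypothetical sketch: apply op(res[i], arg[i]) over contiguous data,
    // reading and writing the destination buffer in place.
    #include <cstddef>
    #include <iostream>
    #include <vector>

    template <typename argT, typename resT> struct BitAndInplace
    {
        void operator()(resT &res, const argT &in) const { res &= in; }
    };

    template <typename argT, typename resT, typename BinaryInplaceOp>
    void inplace_contig_loop(const argT *arg, resT *res, std::size_t nelems,
                             BinaryInplaceOp op)
    {
        for (std::size_t i = 0; i < nelems; ++i) {
            op(res[i], arg[i]); // no temporary result array is allocated
        }
    }

    int main()
    {
        std::vector<int> x{0b1100, 0b1010};
        std::vector<int> y{0b1010, 0b0110};
        inplace_contig_loop(y.data(), x.data(), x.size(),
                            BitAndInplace<int, int>{});
        std::cout << x[0] << " " << x[1] << "\n"; // prints: 8 2
        return 0;
    }
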
_bitwise_left_shift_docstring_, + binary_inplace_fn=ti._bitwise_left_shift_inplace, ) @@ -393,6 +395,7 @@ ti._bitwise_or_result_type, ti._bitwise_or, _bitwise_or_docstring_, + binary_inplace_fn=ti._bitwise_or_inplace, ) # B06: ===== BITWISE_RIGHT_SHIFT (x1, x2) @@ -425,6 +428,7 @@ ti._bitwise_right_shift_result_type, ti._bitwise_right_shift, _bitwise_right_shift_docstring_, + binary_inplace_fn=ti._bitwise_right_shift_inplace, ) @@ -459,6 +463,7 @@ ti._bitwise_xor_result_type, ti._bitwise_xor, _bitwise_xor_docstring_, + binary_inplace_fn=ti._bitwise_xor_inplace, ) @@ -1178,7 +1183,7 @@ _logical_xor_docstring_, ) -# B??: ==== MAXIMUM (x1, x2) +# B26: ==== MAXIMUM (x1, x2) _maximum_docstring_ = """ maximum(x1, x2, out=None, order='K') @@ -1208,7 +1213,7 @@ _maximum_docstring_, ) -# B??: ==== MINIMUM (x1, x2) +# B27: ==== MINIMUM (x1, x2) _minimum_docstring_ = """ minimum(x1, x2, out=None, order='K') @@ -1266,7 +1271,7 @@ ti._multiply_result_type, ti._multiply, _multiply_docstring_, - ti._multiply_inplace, + binary_inplace_fn=ti._multiply_inplace, ) # U25: ==== NEGATIVE (x) @@ -1361,10 +1366,14 @@ the returned array is determined by the Type Promotion Rules. """ pow = BinaryElementwiseFunc( - "pow", ti._pow_result_type, ti._pow, _pow_docstring_ + "pow", + ti._pow_result_type, + ti._pow, + _pow_docstring_, + binary_inplace_fn=ti._pow_inplace, ) -# U??: ==== PROJ (x) +# U40: ==== PROJ (x) _proj_docstring = """ proj(x, out=None, order='K') @@ -1443,7 +1452,11 @@ the returned array is determined by the Type Promotion Rules. """ remainder = BinaryElementwiseFunc( - "remainder", ti._remainder_result_type, ti._remainder, _remainder_docstring_ + "remainder", + ti._remainder_result_type, + ti._remainder, + _remainder_docstring_, + binary_inplace_fn=ti._remainder_inplace, ) # U28: ==== ROUND (x) @@ -1501,7 +1514,7 @@ "sign", ti._sign_result_type, ti._sign, _sign_docstring ) -# ==== SIGNBIT (x) +# U41: ==== SIGNBIT (x) _signbit_docstring = """ signbit(x, out=None, order='K') @@ -1654,7 +1667,7 @@ ti._subtract_result_type, ti._subtract, _subtract_docstring_, - ti._subtract_inplace, + binary_inplace_fn=ti._subtract_inplace, ) diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp index 016b3a05d3..d88d17d3e3 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp @@ -33,6 +33,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl @@ -257,6 +258,144 @@ struct BitwiseAndStridedFactory } }; +template struct BitwiseAndInplaceFunctor +{ + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + void operator()(resT &res, const argT &in) const + { + using tu_ns::convert_impl; + + if constexpr (std::is_same_v) { + res = res && in; + } + else { + res &= in; + } + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) const + { + + if constexpr (std::is_same_v) { + using dpctl::tensor::type_utils::vec_cast; + + auto tmp = (res && in); + res = vec_cast( + tmp); + } + else { + res &= in; + } + } +}; + +template +using BitwiseAndInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + BitwiseAndInplaceFunctor, + vec_sz, + n_vecs>; + +template +using 
BitwiseAndInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + BitwiseAndInplaceFunctor>; + +template +class bitwise_and_inplace_contig_kernel; + +template +sycl::event +bitwise_and_inplace_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, BitwiseAndInplaceContigFunctor, + bitwise_and_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset, + res_p, res_offset, depends); +} + +template +struct BitwiseAndInplaceContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename BitwiseAndOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_and_inplace_contig_impl; + return fn; + } + } +}; + +template +class bitwise_and_inplace_strided_kernel; + +template +sycl::event bitwise_and_inplace_strided_impl( + sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, BitwiseAndInplaceStridedFunctor, + bitwise_and_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct BitwiseAndInplaceStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename BitwiseAndOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_and_inplace_strided_impl; + return fn; + } + } +}; + } // namespace bitwise_and } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp index 9ce56be966..ed4aeeb59e 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp @@ -35,6 +35,8 @@ #include "utils/type_utils.hpp" #include +#include "kernels/elementwise_functions/common.hpp" + namespace dpctl { namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp index 4ae04f97de..5cfd6ca5e3 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp @@ -34,6 +34,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl @@ -268,6 +269,150 @@ struct BitwiseLeftShiftStridedFactory } }; +template struct BitwiseLeftShiftInplaceFunctor +{ + static_assert(std::is_integral_v); + static_assert(!std::is_same_v); + + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + void operator()(resT &res, const argT &in) const + { + impl(res, in); + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) const + { +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + impl(res[i], in[i]); + } + } + +private: + void 
impl(resT &res, const argT &in) const + { + constexpr argT res_bitsize = static_cast(sizeof(resT) * 8); + constexpr resT zero = resT(0); + + // bitshift op with second operand negative, or >= bitwidth(argT1) is UB + // array API spec mandates 0 + if constexpr (std::is_unsigned_v) { + (in < res_bitsize) ? (res <<= in) : res = zero; + } + else { + (in < argT(0)) ? res = zero + : ((in < res_bitsize) ? (res <<= in) : res = zero); + } + } +}; + +template +using BitwiseLeftShiftInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + BitwiseLeftShiftInplaceFunctor, + vec_sz, + n_vecs>; + +template +using BitwiseLeftShiftInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + BitwiseLeftShiftInplaceFunctor>; + +template +class bitwise_left_shift_inplace_contig_kernel; + +template +sycl::event bitwise_left_shift_inplace_contig_impl( + sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, BitwiseLeftShiftInplaceContigFunctor, + bitwise_left_shift_inplace_contig_kernel>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template +struct BitwiseLeftShiftInplaceContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_left_shift_inplace_contig_impl; + return fn; + } + } +}; + +template +class bitwise_left_shift_inplace_strided_kernel; + +template +sycl::event bitwise_left_shift_inplace_strided_impl( + sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, BitwiseLeftShiftInplaceStridedFunctor, + bitwise_left_shift_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct BitwiseLeftShiftInplaceStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_left_shift_inplace_strided_impl; + return fn; + } + } +}; + } // namespace bitwise_left_shift } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp index 65f25dd296..d5669d41b1 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp @@ -33,6 +33,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl @@ -253,6 +254,144 @@ template struct BitwiseOrStridedFactory } }; +template struct BitwiseOrInplaceFunctor +{ + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + void operator()(resT &res, const argT &in) const + { + using tu_ns::convert_impl; + + if constexpr (std::is_same_v) { + res = res || in; + } + else { + res |= in; + } + } + + template + void operator()(sycl::vec &res, + 
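The impl shown just above for the in-place left shift clamps the shift count before applying <<=: shifting by a negative amount or by at least the operand's bit width is undefined behavior in C++, while the array API specification mandates a zero result. The in-place right shift further below additionally saturates negative signed values to -1, mimicking an arithmetic shift that has run out of magnitude bits. A self-contained sketch of both rules in plain C++17, with illustrative names:

    // Standalone sketch of the shift clamping used by the in-place
    // shift functors; safe_left_shift/safe_right_shift are not dpctl names.
    #include <cstdint>
    #include <iostream>
    #include <type_traits>

    template <typename T> T safe_left_shift(T val, T shift)
    {
        constexpr T bits = static_cast<T>(sizeof(T) * 8);
        // C++ leaves val << shift undefined for shift < 0 or
        // shift >= bits; the array API mandates 0 in both cases.
        if constexpr (std::is_signed_v<T>) {
            if (shift < T(0))
                return T(0);
        }
        return (shift < bits) ? static_cast<T>(val << shift) : T(0);
    }

    template <typename T> T safe_right_shift(T val, T shift)
    {
        constexpr T bits = static_cast<T>(sizeof(T) * 8);
        if constexpr (std::is_signed_v<T>) {
            if (shift < T(0))
                return T(0);
            // Arithmetic shift of a negative value saturates to -1
            // (all bits set) once every magnitude bit is shifted out.
            if (shift >= bits)
                return (val < T(0)) ? T(-1) : T(0);
            return static_cast<T>(val >> shift);
        }
        else {
            return (shift < bits) ? static_cast<T>(val >> shift) : T(0);
        }
    }

    int main()
    {
        std::cout << safe_left_shift<std::int32_t>(1, 40) << "\n";    // 0
        std::cout << safe_right_shift<std::int32_t>(-8, 100) << "\n"; // -1
        return 0;
    }
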
const sycl::vec &in) const + { + + if constexpr (std::is_same_v) { + using dpctl::tensor::type_utils::vec_cast; + + auto tmp = (res || in); + res = vec_cast( + tmp); + } + else { + res |= in; + } + } +}; + +template +using BitwiseOrInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + BitwiseOrInplaceFunctor, + vec_sz, + n_vecs>; + +template +using BitwiseOrInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + BitwiseOrInplaceFunctor>; + +template +class bitwise_or_inplace_contig_kernel; + +template +sycl::event +bitwise_or_inplace_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, BitwiseOrInplaceContigFunctor, + bitwise_or_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset, + res_p, res_offset, depends); +} + +template +struct BitwiseOrInplaceContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename BitwiseOrOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_or_inplace_contig_impl; + return fn; + } + } +}; + +template +class bitwise_or_inplace_strided_kernel; + +template +sycl::event bitwise_or_inplace_strided_impl( + sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, BitwiseOrInplaceStridedFunctor, + bitwise_or_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct BitwiseOrInplaceStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename BitwiseOrOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_or_inplace_strided_impl; + return fn; + } + } +}; + } // namespace bitwise_or } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp index 9442d4f6b7..5a04165701 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp @@ -34,6 +34,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl @@ -270,6 +271,152 @@ struct BitwiseRightShiftStridedFactory } }; +template struct BitwiseRightShiftInplaceFunctor +{ + static_assert(std::is_integral_v); + static_assert(!std::is_same_v); + + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + void operator()(resT &res, const argT &in) const + { + impl(res, in); + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) const + { +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + impl(res[i], in[i]); + } + } + +private: + void impl(resT &res, const argT &in) const + { + constexpr argT res_bitsize = static_cast(sizeof(resT) * 8); + constexpr resT zero = resT(0); + + // bitshift 
op with second operand negative, or >= bitwidth(argT1) is UB + // array API spec mandates 0 + if constexpr (std::is_unsigned_v) { + (in < res_bitsize) ? (res >>= in) : res = zero; + } + else { + (in < argT(0)) ? res = zero + : ((in < res_bitsize) ? (res >>= in) + : (res < resT(0)) ? res = resT(-1) + : res = zero); + } + } +}; + +template +using BitwiseRightShiftInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + BitwiseRightShiftInplaceFunctor, + vec_sz, + n_vecs>; + +template +using BitwiseRightShiftInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + BitwiseRightShiftInplaceFunctor>; + +template +class bitwise_right_shift_inplace_contig_kernel; + +template +sycl::event bitwise_right_shift_inplace_contig_impl( + sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, BitwiseRightShiftInplaceContigFunctor, + bitwise_right_shift_inplace_contig_kernel>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template +struct BitwiseRightShiftInplaceContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_right_shift_inplace_contig_impl; + return fn; + } + } +}; + +template +class bitwise_right_shift_inplace_strided_kernel; + +template +sycl::event bitwise_right_shift_inplace_strided_impl( + sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, BitwiseRightShiftInplaceStridedFunctor, + bitwise_right_shift_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct BitwiseRightShiftInplaceStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_right_shift_inplace_strided_impl; + return fn; + } + } +}; + } // namespace bitwise_right_shift } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp index 2b0ab09dca..ec8192fd0f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp @@ -33,6 +33,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl @@ -257,6 +258,144 @@ struct BitwiseXorStridedFactory } }; +template struct BitwiseXorInplaceFunctor +{ + using supports_sg_loadstore = typename std::true_type; + using supports_vec = typename std::true_type; + + void operator()(resT &res, const argT &in) const + { + using tu_ns::convert_impl; + + if constexpr (std::is_same_v) { + res = (res != in); + } + else { + res ^= in; + } + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) const + { + + if constexpr (std::is_same_v) { + using 
dpctl::tensor::type_utils::vec_cast; + + auto tmp = (res != in); + res = vec_cast( + tmp); + } + else { + res ^= in; + } + } +}; + +template +using BitwiseXorInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + BitwiseXorInplaceFunctor, + vec_sz, + n_vecs>; + +template +using BitwiseXorInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + BitwiseXorInplaceFunctor>; + +template +class bitwise_xor_inplace_contig_kernel; + +template +sycl::event +bitwise_xor_inplace_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, BitwiseXorInplaceContigFunctor, + bitwise_xor_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset, + res_p, res_offset, depends); +} + +template +struct BitwiseXorInplaceContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename BitwiseXorOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_xor_inplace_contig_impl; + return fn; + } + } +}; + +template +class bitwise_xor_inplace_strided_kernel; + +template +sycl::event bitwise_xor_inplace_strided_impl( + sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, BitwiseXorInplaceStridedFunctor, + bitwise_xor_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct BitwiseXorInplaceStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename BitwiseXorOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = bitwise_xor_inplace_strided_impl; + return fn; + } + } +}; + } // namespace bitwise_xor } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp index 025d7e8bc4..241c0e7ca8 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp @@ -34,6 +34,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp index ba9241b8db..6654bae384 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp @@ -35,6 +35,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl @@ -55,31 +56,30 @@ template struct PowFunctor using supports_sg_loadstore = std::negation< std::disjunction, tu_ns::is_complex>>; - using supports_vec = - std::negation, - tu_ns::is_complex, - std::is_integral, - std::is_integral>>; + using supports_vec = 
std::negation< + std::disjunction, tu_ns::is_complex>>; - resT operator()(argT1 in1, argT2 in2) const + resT operator()(const argT1 &in1, const argT2 &in2) const { if constexpr (std::is_integral_v || std::is_integral_v) { + auto tmp1 = in1; + auto tmp2 = in2; if constexpr (std::is_signed_v) { - if (in2 < 0) { + if (tmp2 < 0) { // invalid; return 0 return resT(0); } } resT res = 1; - if (in1 == 1 || in2 == 0) { + if (tmp1 == 1 || tmp2 == 0) { return res; } - while (in2 > 0) { - if (in2 & 1) { - res *= in1; + while (tmp2 > 0) { + if (tmp2 & 1) { + res *= tmp1; } - in2 >>= 1; - in1 *= in1; + tmp2 >>= 1; + tmp1 *= tmp1; } return res; } @@ -93,16 +93,48 @@ template struct PowFunctor operator()(const sycl::vec &in1, const sycl::vec &in2) const { - auto res = sycl::pow(in1, in2); - if constexpr (std::is_same_v) { + if constexpr (std::is_integral_v || std::is_integral_v) { + sycl::vec res; +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + auto tmp1 = in1[i]; + auto tmp2 = in2[i]; + if constexpr (std::is_signed_v) { + if (tmp2 < 0) { + // invalid; yield 0 + res[i] = 0; + continue; + } + } + resT res_tmp = 1; + if (tmp1 == 1 || tmp2 == 0) { + res[i] = res_tmp; + continue; + } + while (tmp2 > 0) { + if (tmp2 & 1) { + res_tmp *= tmp1; + } + tmp2 >>= 1; + tmp1 *= tmp1; + } + res[i] = res_tmp; + } return res; } else { - using dpctl::tensor::type_utils::vec_cast; + auto res = sycl::pow(in1, in2); + if constexpr (std::is_same_v) + { + return res; + } + else { + using dpctl::tensor::type_utils::vec_cast; - return vec_cast( - res); + return vec_cast(res); + } } } }; @@ -128,10 +160,6 @@ using PowStridedFunctor = IndexerT, PowFunctor>; -// TODO: when type promotion logic is better defined, -// consider implementing overloads of std::pow that take -// integers for the exponents. 
Seem to give better accuracy in -// some cases (complex data especially) template struct PowOutputType { using value_type = typename std::disjunction< // disjunction is C++17 @@ -286,6 +314,184 @@ template struct PowStridedFactory } }; +template struct PowInplaceFunctor +{ + + using supports_sg_loadstore = std::negation< + std::disjunction, tu_ns::is_complex>>; + using supports_vec = std::negation< + std::disjunction, tu_ns::is_complex>>; + + void operator()(resT &res, const argT &in) + { + if constexpr (std::is_integral_v || std::is_integral_v) { + auto tmp1 = res; + auto tmp2 = in; + if constexpr (std::is_signed_v) { + if (tmp2 < 0) { + // invalid; return 0 + res = 0; + return; + } + } + if (tmp1 == 1) { + return; + } + if (tmp2 == 0) { + res = 1; + return; + } + resT res_tmp = 1; + while (tmp2 > 0) { + if (tmp2 & 1) { + res_tmp *= tmp1; + } + tmp2 >>= 1; + tmp1 *= tmp1; + } + res = res_tmp; + return; + } + else { + res = std::pow(res, in); + }; + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) + { + if constexpr (std::is_integral_v || std::is_integral_v) { +#pragma unroll + for (int i = 0; i < vec_sz; ++i) { + auto tmp1 = res[i]; + auto tmp2 = in[i]; + if constexpr (std::is_signed_v) { + if (tmp2 < 0) { + // invalid; return 0 + res[i] = 0; + continue; + } + } + if (tmp1 == 1) { + continue; + } + if (tmp2 == 0) { + res[i] = 1; + continue; + } + resT res_tmp = 1; + while (tmp2 > 0) { + if (tmp2 & 1) { + res_tmp *= tmp1; + } + tmp2 >>= 1; + tmp1 *= tmp1; + } + res[i] = res_tmp; + } + } + else { + res = sycl::pow(res, in); + } + } +}; + +template +using PowInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + PowInplaceFunctor, + vec_sz, + n_vecs>; + +template +using PowInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + PowInplaceFunctor>; + +template +class pow_inplace_contig_kernel; + +template +sycl::event +pow_inplace_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, PowInplaceContigFunctor, pow_inplace_contig_kernel>( + exec_q, nelems, arg_p, arg_offset, res_p, res_offset, depends); +} + +template struct PowInplaceContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = pow_inplace_contig_impl; + return fn; + } + } +}; + +template +class pow_inplace_strided_kernel; + +template +sycl::event +pow_inplace_strided_impl(sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, PowInplaceStridedFunctor, pow_inplace_strided_kernel>( + exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p, + res_offset, depends, additional_depends); +} + +template +struct PowInplaceStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v::value_type, + void>) { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = pow_inplace_strided_impl; + return fn; + } + } +}; + } // namespace pow } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp 
b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp index 6cd306a900..051a1f9029 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp @@ -35,6 +35,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl @@ -313,6 +314,194 @@ template struct RemainderStridedFactory } }; +template struct RemainderInplaceFunctor +{ + + using supports_sg_loadstore = std::true_type; + using supports_vec = std::true_type; + + // functor is only well-defined when argT and resT are the same + static_assert(std::is_same_v); + + void operator()(resT &res, const argT &in) + { + if constexpr (std::is_integral_v || std::is_integral_v) { + if (in == argT(0)) { + res = 0; + return; + } + if constexpr (std::is_signed_v || std::is_signed_v) { + auto tmp = res; + res %= in; + if (res != resT(0) && l_xor(tmp < 0, in < 0)) { + res += in; + } + } + else { + res %= in; + } + } + else { + res = sycl::fmod(res, in); + if (res) { + if (l_xor(in < 0, res < 0)) { + res += in; + } + } + else { + res = sycl::copysign(resT(0), in); + } + } + } + + template + void operator()(sycl::vec &res, + const sycl::vec &in) + { + if constexpr (std::is_integral_v || std::is_integral_v) { +#pragma unroll + for (auto i = 0; i < vec_sz; ++i) { + if (in[i] == argT(0)) { + res[i] = 0; + } + else { + auto rem = res[i] % in[i]; + if constexpr (std::is_signed_v || + std::is_signed_v) { + if (rem != 0 && l_xor(res[i] < 0, in[i] < 0)) { + rem += in[i]; + } + } + res[i] = rem; + } + } + } + else { + res = sycl::fmod(res, in); +#pragma unroll + for (auto i = 0; i < vec_sz; ++i) { + if (res[i]) { + if (l_xor(in[i] < 0, res[i] < 0)) { + res[i] += in[i]; + } + } + else { + res[i] = sycl::copysign(resT(0), in[i]); + } + } + } + } + +private: + bool l_xor(bool b1, bool b2) const + { + return (b1 != b2); + } +}; + +template +using RemainderInplaceContigFunctor = + elementwise_common::BinaryInplaceContigFunctor< + argT, + resT, + RemainderInplaceFunctor, + vec_sz, + n_vecs>; + +template +using RemainderInplaceStridedFunctor = + elementwise_common::BinaryInplaceStridedFunctor< + argT, + resT, + IndexerT, + RemainderInplaceFunctor>; + +template +class remainder_inplace_contig_kernel; + +template +sycl::event +remainder_inplace_contig_impl(sycl::queue &exec_q, + size_t nelems, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends = {}) +{ + return elementwise_common::binary_inplace_contig_impl< + argTy, resTy, RemainderInplaceContigFunctor, + remainder_inplace_contig_kernel>(exec_q, nelems, arg_p, arg_offset, + res_p, res_offset, depends); +} + +template +struct RemainderInplaceContigFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename RemainderOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = remainder_inplace_contig_impl; + return fn; + } + } +}; + +template +class remainder_inplace_strided_kernel; + +template +sycl::event remainder_inplace_strided_impl( + sycl::queue &exec_q, + size_t nelems, + int nd, + const py::ssize_t *shape_and_strides, + const char *arg_p, + py::ssize_t arg_offset, + char *res_p, + py::ssize_t res_offset, + const std::vector &depends, + const std::vector &additional_depends) +{ + return elementwise_common::binary_inplace_strided_impl< + argTy, resTy, 
RemainderInplaceStridedFunctor, + remainder_inplace_strided_kernel>(exec_q, nelems, nd, shape_and_strides, + arg_p, arg_offset, res_p, res_offset, + depends, additional_depends); +} + +template +struct RemainderInplaceStridedFactory +{ + fnT get() + { + if constexpr (std::is_same_v< + typename RemainderOutputType::value_type, + void>) + { + fnT fn = nullptr; + return fn; + } + else { + fnT fn = remainder_inplace_strided_impl; + return fn; + } + } +}; + } // namespace remainder } // namespace kernels } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp index 3eb8420933..e4ae857738 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp @@ -34,6 +34,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp index 138f7a3f91..86fb0ca2e2 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp @@ -34,6 +34,7 @@ #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" #include namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions.cpp b/dpctl/tensor/libtensor/source/elementwise_functions.cpp deleted file mode 100644 index 043cac0cd2..0000000000 --- a/dpctl/tensor/libtensor/source/elementwise_functions.cpp +++ /dev/null @@ -1,5155 +0,0 @@ -//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// -// -// Data Parallel Control (dpctl) -// -// Copyright 2020-2023 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file defines functions of dpctl.tensor._tensor_impl extensions, -/// specifically functions for elementwise operations. 
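Two integer code paths recur in the functors added above: PowInplaceFunctor reduces integer exponentiation to O(log n) multiplications by squaring, and RemainderInplaceFunctor corrects the sign of the C++ % result so that, as in Python, a nonzero remainder takes the sign of the divisor rather than the dividend. A standalone scalar sketch of both, assuming nothing beyond the standard library (int_pow and py_mod are illustrative names):

    #include <iostream>

    // Exponentiation by squaring for non-negative integer exponents:
    // O(log exp) multiplications instead of O(exp).
    long long int_pow(long long base, long long exp)
    {
        if (exp < 0)
            return 0; // negative integer exponent is invalid; spec yields 0
        long long res = 1;
        if (base == 1 || exp == 0)
            return res;
        while (exp > 0) {
            if (exp & 1)
                res *= base; // fold in the current bit of the exponent
            exp >>= 1;
            base *= base;    // square the base for the next bit
        }
        return res;
    }

    // Python-style modulo: a nonzero result takes the divisor's sign.
    long long py_mod(long long x, long long y)
    {
        if (y == 0)
            return 0;        // matches the functor's integer zero-divisor branch
        long long r = x % y; // C++ remainder follows the sign of x
        if (r != 0 && ((r < 0) != (y < 0)))
            r += y;          // shift into the divisor's sign range
        return r;
    }

    int main()
    {
        std::cout << int_pow(3, 5) << "\n"; // 243
        std::cout << py_mod(-7, 3) << "\n"; // 2 (plain -7 % 3 gives -1)
        return 0;
    }

The vec_sz-wide vector overloads in the headers apply this same scalar logic lane by lane inside the #pragma unroll loops.
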
-//===----------------------------------------------------------------------===// - -#include "dpctl4pybind11.hpp" -#include -#include -#include -#include -#include - -#include "elementwise_functions.hpp" -#include "utils/type_dispatch.hpp" - -#include "kernels/elementwise_functions/abs.hpp" -#include "kernels/elementwise_functions/acos.hpp" -#include "kernels/elementwise_functions/acosh.hpp" -#include "kernels/elementwise_functions/add.hpp" -#include "kernels/elementwise_functions/asin.hpp" -#include "kernels/elementwise_functions/asinh.hpp" -#include "kernels/elementwise_functions/atan.hpp" -#include "kernels/elementwise_functions/atan2.hpp" -#include "kernels/elementwise_functions/atanh.hpp" -#include "kernels/elementwise_functions/bitwise_and.hpp" -#include "kernels/elementwise_functions/bitwise_invert.hpp" -#include "kernels/elementwise_functions/bitwise_left_shift.hpp" -#include "kernels/elementwise_functions/bitwise_or.hpp" -#include "kernels/elementwise_functions/bitwise_right_shift.hpp" -#include "kernels/elementwise_functions/bitwise_xor.hpp" -#include "kernels/elementwise_functions/cbrt.hpp" -#include "kernels/elementwise_functions/ceil.hpp" -#include "kernels/elementwise_functions/conj.hpp" -#include "kernels/elementwise_functions/copysign.hpp" -#include "kernels/elementwise_functions/cos.hpp" -#include "kernels/elementwise_functions/cosh.hpp" -#include "kernels/elementwise_functions/equal.hpp" -#include "kernels/elementwise_functions/exp.hpp" -#include "kernels/elementwise_functions/exp2.hpp" -#include "kernels/elementwise_functions/expm1.hpp" -#include "kernels/elementwise_functions/floor.hpp" -#include "kernels/elementwise_functions/floor_divide.hpp" -#include "kernels/elementwise_functions/greater.hpp" -#include "kernels/elementwise_functions/greater_equal.hpp" -#include "kernels/elementwise_functions/hypot.hpp" -#include "kernels/elementwise_functions/imag.hpp" -#include "kernels/elementwise_functions/isfinite.hpp" -#include "kernels/elementwise_functions/isinf.hpp" -#include "kernels/elementwise_functions/isnan.hpp" -#include "kernels/elementwise_functions/less.hpp" -#include "kernels/elementwise_functions/less_equal.hpp" -#include "kernels/elementwise_functions/log.hpp" -#include "kernels/elementwise_functions/log10.hpp" -#include "kernels/elementwise_functions/log1p.hpp" -#include "kernels/elementwise_functions/log2.hpp" -#include "kernels/elementwise_functions/logaddexp.hpp" -#include "kernels/elementwise_functions/logical_and.hpp" -#include "kernels/elementwise_functions/logical_not.hpp" -#include "kernels/elementwise_functions/logical_or.hpp" -#include "kernels/elementwise_functions/logical_xor.hpp" -#include "kernels/elementwise_functions/maximum.hpp" -#include "kernels/elementwise_functions/minimum.hpp" -#include "kernels/elementwise_functions/multiply.hpp" -#include "kernels/elementwise_functions/negative.hpp" -#include "kernels/elementwise_functions/not_equal.hpp" -#include "kernels/elementwise_functions/positive.hpp" -#include "kernels/elementwise_functions/pow.hpp" -#include "kernels/elementwise_functions/proj.hpp" -#include "kernels/elementwise_functions/real.hpp" -#include "kernels/elementwise_functions/remainder.hpp" -#include "kernels/elementwise_functions/round.hpp" -#include "kernels/elementwise_functions/rsqrt.hpp" -#include "kernels/elementwise_functions/sign.hpp" -#include "kernels/elementwise_functions/signbit.hpp" -#include "kernels/elementwise_functions/sin.hpp" -#include "kernels/elementwise_functions/sinh.hpp" -#include 
"kernels/elementwise_functions/sqrt.hpp" -#include "kernels/elementwise_functions/square.hpp" -#include "kernels/elementwise_functions/subtract.hpp" -#include "kernels/elementwise_functions/tan.hpp" -#include "kernels/elementwise_functions/tanh.hpp" -#include "kernels/elementwise_functions/true_divide.hpp" -#include "kernels/elementwise_functions/trunc.hpp" - -namespace dpctl -{ -namespace tensor -{ -namespace py_internal -{ - -namespace td_ns = dpctl::tensor::type_dispatch; - -py::dtype _dtype_from_typenum(td_ns::typenum_t dst_typenum_t) -{ - switch (dst_typenum_t) { - case td_ns::typenum_t::BOOL: - return py::dtype("?"); - case td_ns::typenum_t::INT8: - return py::dtype("i1"); - case td_ns::typenum_t::UINT8: - return py::dtype("u1"); - case td_ns::typenum_t::INT16: - return py::dtype("i2"); - case td_ns::typenum_t::UINT16: - return py::dtype("u2"); - case td_ns::typenum_t::INT32: - return py::dtype("i4"); - case td_ns::typenum_t::UINT32: - return py::dtype("u4"); - case td_ns::typenum_t::INT64: - return py::dtype("i8"); - case td_ns::typenum_t::UINT64: - return py::dtype("u8"); - case td_ns::typenum_t::HALF: - return py::dtype("f2"); - case td_ns::typenum_t::FLOAT: - return py::dtype("f4"); - case td_ns::typenum_t::DOUBLE: - return py::dtype("f8"); - case td_ns::typenum_t::CFLOAT: - return py::dtype("c8"); - case td_ns::typenum_t::CDOUBLE: - return py::dtype("c16"); - default: - throw py::value_error("Unrecognized dst_typeid"); - } -} - -int _result_typeid(int arg_typeid, const int *fn_output_id) -{ - if (arg_typeid < 0 || arg_typeid >= td_ns::num_types) { - throw py::value_error("Input typeid " + std::to_string(arg_typeid) + - " is outside of expected bounds."); - } - - return fn_output_id[arg_typeid]; -} - -namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; -using ew_cmn_ns::binary_contig_impl_fn_ptr_t; -using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; -using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; -using ew_cmn_ns::binary_strided_impl_fn_ptr_t; -using ew_cmn_ns::unary_contig_impl_fn_ptr_t; -using ew_cmn_ns::unary_strided_impl_fn_ptr_t; - -using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; -using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; -using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; - -// U01: ==== ABS (x) -namespace impl -{ - -namespace abs_fn_ns = dpctl::tensor::kernels::abs; - -static unary_contig_impl_fn_ptr_t abs_contig_dispatch_vector[td_ns::num_types]; -static int abs_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - abs_strided_dispatch_vector[td_ns::num_types]; - -void populate_abs_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = abs_fn_ns; - - using fn_ns::AbsContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(abs_contig_dispatch_vector); - - using fn_ns::AbsStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(abs_strided_dispatch_vector); - - using fn_ns::AbsTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(abs_output_typeid_vector); -}; - -} // namespace impl - -// U02: ==== ACOS (x) -namespace impl -{ - -namespace acos_fn_ns = dpctl::tensor::kernels::acos; - -static unary_contig_impl_fn_ptr_t acos_contig_dispatch_vector[td_ns::num_types]; -static int acos_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - acos_strided_dispatch_vector[td_ns::num_types]; - -void populate_acos_dispatch_vectors(void) -{ - using namespace 
td_ns; - namespace fn_ns = acos_fn_ns; - - using fn_ns::AcosContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(acos_contig_dispatch_vector); - - using fn_ns::AcosStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(acos_strided_dispatch_vector); - - using fn_ns::AcosTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(acos_output_typeid_vector); -} - -} // namespace impl - -// U03: ===== ACOSH (x) -namespace impl -{ - -namespace acosh_fn_ns = dpctl::tensor::kernels::acosh; - -static unary_contig_impl_fn_ptr_t - acosh_contig_dispatch_vector[td_ns::num_types]; -static int acosh_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - acosh_strided_dispatch_vector[td_ns::num_types]; - -void populate_acosh_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = acosh_fn_ns; - - using fn_ns::AcoshContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(acosh_contig_dispatch_vector); - - using fn_ns::AcoshStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(acosh_strided_dispatch_vector); - - using fn_ns::AcoshTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(acosh_output_typeid_vector); -} - -} // namespace impl - -// B01: ===== ADD (x1, x2) -namespace impl -{ -namespace add_fn_ns = dpctl::tensor::kernels::add; - -static binary_contig_impl_fn_ptr_t add_contig_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static int add_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - add_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -// add(matrix, row) -static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t - add_contig_matrix_contig_row_broadcast_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -// add(row, matrix) -static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t - add_contig_row_contig_matrix_broadcast_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -static binary_inplace_contig_impl_fn_ptr_t - add_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static binary_inplace_strided_impl_fn_ptr_t - add_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; -static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t - add_inplace_row_matrix_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_add_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = add_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::AddTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(add_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::AddStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(add_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::AddContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(add_contig_dispatch_table); - - // function pointers for operation on contiguous matrix, contiguous row - // with contiguous matrix output - using fn_ns::AddContigMatrixContigRowBroadcastFactory; - DispatchTableBuilder< - binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t, - AddContigMatrixContigRowBroadcastFactory, num_types> - dtb4; - dtb4.populate_dispatch_table( - add_contig_matrix_contig_row_broadcast_dispatch_table); - - // function pointers for 
operation on contiguous row, contiguous matrix - // with contiguous matrix output - using fn_ns::AddContigRowContigMatrixBroadcastFactory; - DispatchTableBuilder< - binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t, - AddContigRowContigMatrixBroadcastFactory, num_types> - dtb5; - dtb5.populate_dispatch_table( - add_contig_row_contig_matrix_broadcast_dispatch_table); - - // function pointers for inplace operation on general strided arrays - using fn_ns::AddInplaceStridedFactory; - DispatchTableBuilder - dtb6; - dtb6.populate_dispatch_table(add_inplace_strided_dispatch_table); - - // function pointers for inplace operation on contiguous inputs and output - using fn_ns::AddInplaceContigFactory; - DispatchTableBuilder - dtb7; - dtb7.populate_dispatch_table(add_inplace_contig_dispatch_table); - - // function pointers for inplace operation on contiguous matrix - // and contiguous row - using fn_ns::AddInplaceRowMatrixBroadcastFactory; - DispatchTableBuilder - dtb8; - dtb8.populate_dispatch_table(add_inplace_row_matrix_dispatch_table); -}; - -} // namespace impl - -// U04: ===== ASIN (x) -namespace impl -{ - -namespace asin_fn_ns = dpctl::tensor::kernels::asin; - -static unary_contig_impl_fn_ptr_t asin_contig_dispatch_vector[td_ns::num_types]; -static int asin_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - asin_strided_dispatch_vector[td_ns::num_types]; - -void populate_asin_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = asin_fn_ns; - - using fn_ns::AsinContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(asin_contig_dispatch_vector); - - using fn_ns::AsinStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(asin_strided_dispatch_vector); - - using fn_ns::AsinTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(asin_output_typeid_vector); -} - -} // namespace impl - -// U05: ===== ASINH (x) -namespace impl -{ - -namespace asinh_fn_ns = dpctl::tensor::kernels::asinh; - -static unary_contig_impl_fn_ptr_t - asinh_contig_dispatch_vector[td_ns::num_types]; -static int asinh_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - asinh_strided_dispatch_vector[td_ns::num_types]; - -void populate_asinh_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = asinh_fn_ns; - - using fn_ns::AsinhContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(asinh_contig_dispatch_vector); - - using fn_ns::AsinhStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(asinh_strided_dispatch_vector); - - using fn_ns::AsinhTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(asinh_output_typeid_vector); -} - -} // namespace impl - -// U06: ===== ATAN (x) -namespace impl -{ - -namespace atan_fn_ns = dpctl::tensor::kernels::atan; - -static unary_contig_impl_fn_ptr_t atan_contig_dispatch_vector[td_ns::num_types]; -static int atan_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - atan_strided_dispatch_vector[td_ns::num_types]; - -void populate_atan_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = atan_fn_ns; - - using fn_ns::AtanContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(atan_contig_dispatch_vector); - - using fn_ns::AtanStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(atan_strided_dispatch_vector); - - using fn_ns::AtanTypeMapFactory; - DispatchVectorBuilder dvb3; - 
dvb3.populate_dispatch_vector(atan_output_typeid_vector); -} - -} // namespace impl - -// B02: ===== ATAN2 (x1, x2) -namespace impl -{ -namespace atan2_fn_ns = dpctl::tensor::kernels::atan2; - -static binary_contig_impl_fn_ptr_t - atan2_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int atan2_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - atan2_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_atan2_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = atan2_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::Atan2TypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(atan2_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::Atan2StridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(atan2_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::Atan2ContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(atan2_contig_dispatch_table); -}; - -} // namespace impl - -// U07: ===== ATANH (x) -namespace impl -{ - -namespace atanh_fn_ns = dpctl::tensor::kernels::atanh; - -static unary_contig_impl_fn_ptr_t - atanh_contig_dispatch_vector[td_ns::num_types]; -static int atanh_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - atanh_strided_dispatch_vector[td_ns::num_types]; - -void populate_atanh_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = atanh_fn_ns; - - using fn_ns::AtanhContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(atanh_contig_dispatch_vector); - - using fn_ns::AtanhStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(atanh_strided_dispatch_vector); - - using fn_ns::AtanhTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(atanh_output_typeid_vector); -} - -} // namespace impl - -// B03: ===== BITWISE_AND (x1, x2) -namespace impl -{ -namespace bitwise_and_fn_ns = dpctl::tensor::kernels::bitwise_and; - -static binary_contig_impl_fn_ptr_t - bitwise_and_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int bitwise_and_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - bitwise_and_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_bitwise_and_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = bitwise_and_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::BitwiseAndTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(bitwise_and_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::BitwiseAndStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(bitwise_and_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::BitwiseAndContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(bitwise_and_contig_dispatch_table); -}; - -} // namespace impl - -// B04: ===== BITWISE_LEFT_SHIFT (x1, x2) -namespace impl -{ -namespace bitwise_left_shift_fn_ns = dpctl::tensor::kernels::bitwise_left_shift; - -static binary_contig_impl_fn_ptr_t - bitwise_left_shift_contig_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static int 
bitwise_left_shift_output_id_table[td_ns::num_types] - [td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - bitwise_left_shift_strided_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -void populate_bitwise_left_shift_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = bitwise_left_shift_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::BitwiseLeftShiftTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(bitwise_left_shift_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::BitwiseLeftShiftStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(bitwise_left_shift_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::BitwiseLeftShiftContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(bitwise_left_shift_contig_dispatch_table); -}; - -} // namespace impl - -// U08: ===== BITWISE_INVERT (x) -namespace impl -{ - -namespace bitwise_invert_fn_ns = dpctl::tensor::kernels::bitwise_invert; - -static unary_contig_impl_fn_ptr_t - bitwise_invert_contig_dispatch_vector[td_ns::num_types]; -static int bitwise_invert_output_typeid_vector[td_ns::num_types]; -static unary_strided_impl_fn_ptr_t - bitwise_invert_strided_dispatch_vector[td_ns::num_types]; - -void populate_bitwise_invert_dispatch_vectors(void) -{ - using namespace td_ns; - namespace fn_ns = bitwise_invert_fn_ns; - - using fn_ns::BitwiseInvertContigFactory; - DispatchVectorBuilder - dvb1; - dvb1.populate_dispatch_vector(bitwise_invert_contig_dispatch_vector); - - using fn_ns::BitwiseInvertStridedFactory; - DispatchVectorBuilder - dvb2; - dvb2.populate_dispatch_vector(bitwise_invert_strided_dispatch_vector); - - using fn_ns::BitwiseInvertTypeMapFactory; - DispatchVectorBuilder dvb3; - dvb3.populate_dispatch_vector(bitwise_invert_output_typeid_vector); -}; - -} // namespace impl - -// B05: ===== BITWISE_OR (x1, x2) -namespace impl -{ -namespace bitwise_or_fn_ns = dpctl::tensor::kernels::bitwise_or; - -static binary_contig_impl_fn_ptr_t - bitwise_or_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; -static int bitwise_or_output_id_table[td_ns::num_types][td_ns::num_types]; - -static binary_strided_impl_fn_ptr_t - bitwise_or_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; - -void populate_bitwise_or_dispatch_tables(void) -{ - using namespace td_ns; - namespace fn_ns = bitwise_or_fn_ns; - - // which input types are supported, and what is the type of the result - using fn_ns::BitwiseOrTypeMapFactory; - DispatchTableBuilder dtb1; - dtb1.populate_dispatch_table(bitwise_or_output_id_table); - - // function pointers for operation on general strided arrays - using fn_ns::BitwiseOrStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(bitwise_or_strided_dispatch_table); - - // function pointers for operation on contiguous inputs and output - using fn_ns::BitwiseOrContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(bitwise_or_contig_dispatch_table); -}; -} // namespace impl - -// B06: ===== BITWISE_RIGHT_SHIFT (x1, x2) -namespace impl -{ -namespace bitwise_right_shift_fn_ns = - dpctl::tensor::kernels::bitwise_right_shift; - -static binary_contig_impl_fn_ptr_t - bitwise_right_shift_contig_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static int bitwise_right_shift_output_id_table[td_ns::num_types] - [td_ns::num_types]; - -static 
-// B06: ===== BITWISE_RIGHT_SHIFT (x1, x2)
-namespace impl
-{
-namespace bitwise_right_shift_fn_ns =
-    dpctl::tensor::kernels::bitwise_right_shift;
-
-static binary_contig_impl_fn_ptr_t
-    bitwise_right_shift_contig_dispatch_table[td_ns::num_types]
-                                             [td_ns::num_types];
-static int bitwise_right_shift_output_id_table[td_ns::num_types]
-                                              [td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    bitwise_right_shift_strided_dispatch_table[td_ns::num_types]
-                                              [td_ns::num_types];
-
-void populate_bitwise_right_shift_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = bitwise_right_shift_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::BitwiseRightShiftTypeMapFactory;
-    DispatchTableBuilder<int, BitwiseRightShiftTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(bitwise_right_shift_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::BitwiseRightShiftStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
-                         BitwiseRightShiftStridedFactory, num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(bitwise_right_shift_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::BitwiseRightShiftContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t,
-                         BitwiseRightShiftContigFactory, num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(bitwise_right_shift_contig_dispatch_table);
-};
-
-} // namespace impl
-
-// B07: ===== BITWISE_XOR (x1, x2)
-namespace impl
-{
-namespace bitwise_xor_fn_ns = dpctl::tensor::kernels::bitwise_xor;
-
-static binary_contig_impl_fn_ptr_t
-    bitwise_xor_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int bitwise_xor_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    bitwise_xor_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_bitwise_xor_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = bitwise_xor_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::BitwiseXorTypeMapFactory;
-    DispatchTableBuilder<int, BitwiseXorTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(bitwise_xor_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::BitwiseXorStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
-                         BitwiseXorStridedFactory, num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(bitwise_xor_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::BitwiseXorContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t,
-                         BitwiseXorContigFactory, num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(bitwise_xor_contig_dispatch_table);
-};
-} // namespace impl
-
-// U09: ==== CEIL (x)
-namespace impl
-{
-
-namespace ceil_fn_ns = dpctl::tensor::kernels::ceil;
-
-static unary_contig_impl_fn_ptr_t ceil_contig_dispatch_vector[td_ns::num_types];
-static int ceil_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    ceil_strided_dispatch_vector[td_ns::num_types];
-
-void populate_ceil_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = ceil_fn_ns;
-
-    using fn_ns::CeilContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, CeilContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(ceil_contig_dispatch_vector);
-
-    using fn_ns::CeilStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, CeilStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(ceil_strided_dispatch_vector);
-
-    using fn_ns::CeilTypeMapFactory;
-    DispatchVectorBuilder<int, CeilTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(ceil_output_typeid_vector);
-}
-
-} // namespace impl
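
Each op registers two kernel flavors, which is why every section declares both a contig and a strided dispatch structure. Hypothetical, simplified signatures follow; dpctl's actual typedefs, defined with the elementwise kernels' common headers, carry the same kinds of arguments but should not be assumed identical:

    #include <sycl/sycl.hpp>
    #include <cstddef>
    #include <vector>

    // Contiguous flavor: unit strides are guaranteed, so the kernel can
    // use vectorized loads/stores.
    using unary_contig_fn_sketch_t = sycl::event (*)(
        sycl::queue &exec_q, std::size_t nelems, const char *src_p,
        char *dst_p, const std::vector<sycl::event> &depends);

    // Strided flavor: receives dimensionality plus packed shape/stride
    // metadata and handles arbitrary layouts.
    using unary_strided_fn_sketch_t = sycl::event (*)(
        sycl::queue &exec_q, std::size_t nelems, int nd,
        const std::ptrdiff_t *shape_and_strides, const char *src_p,
        std::ptrdiff_t src_offset, char *dst_p, std::ptrdiff_t dst_offset,
        const std::vector<sycl::event> &depends);
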
-// U10: ==== CONJ (x)
-namespace impl
-{
-
-namespace conj_fn_ns = dpctl::tensor::kernels::conj;
-
-static unary_contig_impl_fn_ptr_t conj_contig_dispatch_vector[td_ns::num_types];
-static int conj_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    conj_strided_dispatch_vector[td_ns::num_types];
-
-void populate_conj_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = conj_fn_ns;
-
-    using fn_ns::ConjContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ConjContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(conj_contig_dispatch_vector);
-
-    using fn_ns::ConjStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, ConjStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(conj_strided_dispatch_vector);
-
-    using fn_ns::ConjTypeMapFactory;
-    DispatchVectorBuilder<int, ConjTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(conj_output_typeid_vector);
-}
-} // namespace impl
-
-// U11: ==== COS (x)
-namespace impl
-{
-
-namespace cos_fn_ns = dpctl::tensor::kernels::cos;
-
-static unary_contig_impl_fn_ptr_t cos_contig_dispatch_vector[td_ns::num_types];
-static int cos_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    cos_strided_dispatch_vector[td_ns::num_types];
-
-void populate_cos_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = cos_fn_ns;
-
-    using fn_ns::CosContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, CosContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(cos_contig_dispatch_vector);
-
-    using fn_ns::CosStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, CosStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(cos_strided_dispatch_vector);
-
-    using fn_ns::CosTypeMapFactory;
-    DispatchVectorBuilder<int, CosTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(cos_output_typeid_vector);
-}
-
-} // namespace impl
-
-// U12: ==== COSH (x)
-namespace impl
-{
-
-namespace cosh_fn_ns = dpctl::tensor::kernels::cosh;
-
-static unary_contig_impl_fn_ptr_t cosh_contig_dispatch_vector[td_ns::num_types];
-static int cosh_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    cosh_strided_dispatch_vector[td_ns::num_types];
-
-void populate_cosh_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = cosh_fn_ns;
-
-    using fn_ns::CoshContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, CoshContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(cosh_contig_dispatch_vector);
-
-    using fn_ns::CoshStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, CoshStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(cosh_strided_dispatch_vector);
-
-    using fn_ns::CoshTypeMapFactory;
-    DispatchVectorBuilder<int, CoshTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(cosh_output_typeid_vector);
-}
-
-} // namespace impl
-
-// B08: ==== DIVIDE (x1, x2)
-namespace impl
-{
-namespace true_divide_fn_ns = dpctl::tensor::kernels::true_divide;
-
-static binary_contig_impl_fn_ptr_t
-    true_divide_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int true_divide_output_id_table[td_ns::num_types][td_ns::num_types];
-static int true_divide_inplace_output_id_table[td_ns::num_types]
-                                              [td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    true_divide_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-// divide(matrix, row)
-static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
-    true_divide_contig_matrix_contig_row_broadcast_dispatch_table
-        [td_ns::num_types][td_ns::num_types];
-
-// divide(row, matrix)
-static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
-    true_divide_contig_row_contig_matrix_broadcast_dispatch_table
-        [td_ns::num_types][td_ns::num_types];
-
-static binary_inplace_contig_impl_fn_ptr_t
-    true_divide_inplace_contig_dispatch_table[td_ns::num_types]
-                                             [td_ns::num_types];
-static binary_inplace_strided_impl_fn_ptr_t
-    true_divide_inplace_strided_dispatch_table[td_ns::num_types]
-                                              [td_ns::num_types];
-static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
-    true_divide_inplace_row_matrix_dispatch_table[td_ns::num_types]
-                                                 [td_ns::num_types];
-
-void populate_true_divide_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = true_divide_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::TrueDivideTypeMapFactory;
-    DispatchTableBuilder<int, TrueDivideTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(true_divide_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::TrueDivideStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
-                         TrueDivideStridedFactory, num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(true_divide_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::TrueDivideContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, TrueDivideContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(true_divide_contig_dispatch_table);
-
-    // function pointers for operation on contiguous matrix, contiguous row
-    // with contiguous matrix output
-    using fn_ns::TrueDivideContigMatrixContigRowBroadcastFactory;
-    DispatchTableBuilder<
-        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
-        TrueDivideContigMatrixContigRowBroadcastFactory, num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(
-        true_divide_contig_matrix_contig_row_broadcast_dispatch_table);
-
-    // function pointers for operation on contiguous row, contiguous matrix
-    // with contiguous matrix output
-    using fn_ns::TrueDivideContigRowContigMatrixBroadcastFactory;
-    DispatchTableBuilder<
-        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
-        TrueDivideContigRowContigMatrixBroadcastFactory, num_types>
-        dtb5;
-    dtb5.populate_dispatch_table(
-        true_divide_contig_row_contig_matrix_broadcast_dispatch_table);
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::TrueDivideInplaceTypeMapFactory;
-    DispatchTableBuilder<int, TrueDivideInplaceTypeMapFactory, num_types> dtb6;
-    dtb6.populate_dispatch_table(true_divide_inplace_output_id_table);
-
-    // function pointers for inplace operation on general strided arrays
-    using fn_ns::TrueDivideInplaceStridedFactory;
-    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
-                         TrueDivideInplaceStridedFactory, num_types>
-        dtb7;
-    dtb7.populate_dispatch_table(true_divide_inplace_strided_dispatch_table);
-
-    // function pointers for inplace operation on contiguous inputs and output
-    using fn_ns::TrueDivideInplaceContigFactory;
-    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
-                         TrueDivideInplaceContigFactory, num_types>
-        dtb8;
-    dtb8.populate_dispatch_table(true_divide_inplace_contig_dispatch_table);
-
-    // function pointers for inplace operation on contiguous matrix
-    // and contiguous row
-    using fn_ns::TrueDivideInplaceRowMatrixBroadcastFactory;
-    DispatchTableBuilder<binary_inplace_row_matrix_broadcast_impl_fn_ptr_t,
-                         TrueDivideInplaceRowMatrixBroadcastFactory, num_types>
-        dtb9;
-    dtb9.populate_dispatch_table(true_divide_inplace_row_matrix_dispatch_table);
-};
-
-} // namespace impl
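
Beyond the generic contig/strided pair, divide registers specialized broadcast tables for the common matrix-row and row-matrix cases, plus inplace variants. The payoff is that a dedicated kernel can skip stride simplification entirely. An illustrative selection order (not dpctl's actual dispatch code; entries may be null when no specialized kernel exists for a type pair, and the general strided kernel is the universal fallback):

    // Stand-in for the sycl::event-returning kernel pointers above.
    using kernel_fn_sketch_t = int (*)();

    inline kernel_fn_sketch_t
    select_binary_kernel(kernel_fn_sketch_t matrix_row_fn,
                         kernel_fn_sketch_t contig_fn,
                         kernel_fn_sketch_t strided_fn,
                         bool is_contig_matrix_plus_row,
                         bool all_contiguous)
    {
        if (is_contig_matrix_plus_row && matrix_row_fn) {
            return matrix_row_fn; // broadcast fast path
        }
        if (all_contiguous && contig_fn) {
            return contig_fn; // unit-stride fast path
        }
        return strided_fn; // handles any layout
    }
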
-// B09: ==== EQUAL (x1, x2)
-namespace impl
-{
-namespace equal_fn_ns = dpctl::tensor::kernels::equal;
-
-static binary_contig_impl_fn_ptr_t
-    equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int equal_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_equal_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = equal_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::EqualTypeMapFactory;
-    DispatchTableBuilder<int, EqualTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(equal_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::EqualStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, EqualStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(equal_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::EqualContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, EqualContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(equal_contig_dispatch_table);
-};
-} // namespace impl
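
The *_output_id_table built by each TypeMapFactory answers two questions at call time: is this pair of input types supported, and what is the result type id. For comparisons such as equal, the type map yields the bool type id for every supported pair. A sketch of the lookup, in the spirit of what the file's py_binary_ufunc helpers do (14 is assumed as num_types, matching the enumeration sketched earlier; a negative entry conventionally marks an unsupported combination):

    #include <stdexcept>

    inline int result_typeid_sketch(const int (&output_id_table)[14][14],
                                    int src1_typeid, int src2_typeid)
    {
        const int out_typeid = output_id_table[src1_typeid][src2_typeid];
        if (out_typeid < 0) {
            throw std::runtime_error("unsupported input type combination");
        }
        return out_typeid; // for equal/less/etc. this is the bool typeid
    }
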
-// U13: ==== EXP (x)
-namespace impl
-{
-
-namespace exp_fn_ns = dpctl::tensor::kernels::exp;
-
-static unary_contig_impl_fn_ptr_t
-    exp_contig_dispatch_vector[td_ns::num_types];
-static int exp_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    exp_strided_dispatch_vector[td_ns::num_types];
-
-void populate_exp_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = exp_fn_ns;
-
-    using fn_ns::ExpContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ExpContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(exp_contig_dispatch_vector);
-
-    using fn_ns::ExpStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, ExpStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(exp_strided_dispatch_vector);
-
-    using fn_ns::ExpTypeMapFactory;
-    DispatchVectorBuilder<int, ExpTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(exp_output_typeid_vector);
-}
-
-} // namespace impl
-
-// U14: ==== EXPM1 (x)
-namespace impl
-{
-
-namespace expm1_fn_ns = dpctl::tensor::kernels::expm1;
-
-static unary_contig_impl_fn_ptr_t
-    expm1_contig_dispatch_vector[td_ns::num_types];
-static int expm1_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    expm1_strided_dispatch_vector[td_ns::num_types];
-
-void populate_expm1_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = expm1_fn_ns;
-
-    using fn_ns::Expm1ContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Expm1ContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(expm1_contig_dispatch_vector);
-
-    using fn_ns::Expm1StridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Expm1StridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(expm1_strided_dispatch_vector);
-
-    using fn_ns::Expm1TypeMapFactory;
-    DispatchVectorBuilder<int, Expm1TypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(expm1_output_typeid_vector);
-}
-
-} // namespace impl
-
-// U15: ==== FLOOR (x)
-namespace impl
-{
-
-namespace floor_fn_ns = dpctl::tensor::kernels::floor;
-
-static unary_contig_impl_fn_ptr_t
-    floor_contig_dispatch_vector[td_ns::num_types];
-static int floor_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    floor_strided_dispatch_vector[td_ns::num_types];
-
-void populate_floor_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = floor_fn_ns;
-
-    using fn_ns::FloorContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, FloorContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(floor_contig_dispatch_vector);
-
-    using fn_ns::FloorStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, FloorStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(floor_strided_dispatch_vector);
-
-    using fn_ns::FloorTypeMapFactory;
-    DispatchVectorBuilder<int, FloorTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(floor_output_typeid_vector);
-}
-
-} // namespace impl
-// B10: ==== FLOOR_DIVIDE (x1, x2)
-namespace impl
-{
-namespace floor_divide_fn_ns = dpctl::tensor::kernels::floor_divide;
-
-static binary_contig_impl_fn_ptr_t
-    floor_divide_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int floor_divide_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    floor_divide_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-static binary_inplace_contig_impl_fn_ptr_t
-    floor_divide_inplace_contig_dispatch_table[td_ns::num_types]
-                                              [td_ns::num_types];
-static binary_inplace_strided_impl_fn_ptr_t
-    floor_divide_inplace_strided_dispatch_table[td_ns::num_types]
-                                               [td_ns::num_types];
-
-void populate_floor_divide_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = floor_divide_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::FloorDivideTypeMapFactory;
-    DispatchTableBuilder<int, FloorDivideTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(floor_divide_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::FloorDivideStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
-                         FloorDivideStridedFactory, num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(floor_divide_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::FloorDivideContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, FloorDivideContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(floor_divide_contig_dispatch_table);
-
-    // function pointers for inplace operation on general strided arrays
-    using fn_ns::FloorDivideInplaceStridedFactory;
-    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
-                         FloorDivideInplaceStridedFactory, num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(floor_divide_inplace_strided_dispatch_table);
-
-    // function pointers for inplace operation on contiguous inputs and output
-    using fn_ns::FloorDivideInplaceContigFactory;
-    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
-                         FloorDivideInplaceContigFactory, num_types>
-        dtb5;
-    dtb5.populate_dispatch_table(floor_divide_inplace_contig_dispatch_table);
-};
-
-} // namespace impl
-
-// B11: ==== GREATER (x1, x2)
-namespace impl
-{
-namespace greater_fn_ns = dpctl::tensor::kernels::greater;
-
-static binary_contig_impl_fn_ptr_t
-    greater_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int greater_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    greater_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_greater_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = greater_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::GreaterTypeMapFactory;
-    DispatchTableBuilder<int, GreaterTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(greater_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::GreaterStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, GreaterStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(greater_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::GreaterContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, GreaterContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(greater_contig_dispatch_table);
-};
-} // namespace impl
-
-// B12: ==== GREATER_EQUAL (x1, x2)
-namespace impl
-{
-namespace greater_equal_fn_ns = dpctl::tensor::kernels::greater_equal;
-
-static binary_contig_impl_fn_ptr_t
-    greater_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int greater_equal_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    greater_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_greater_equal_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = greater_equal_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::GreaterEqualTypeMapFactory;
-    DispatchTableBuilder<int, GreaterEqualTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(greater_equal_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::GreaterEqualStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
-                         GreaterEqualStridedFactory, num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(greater_equal_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::GreaterEqualContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t,
-                         GreaterEqualContigFactory, num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(greater_equal_contig_dispatch_table);
-};
-} // namespace impl
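
floor_divide above (like divide before it) also registers inplace dispatch tables. Inplace kernels have a different shape: the left-hand operand is read-modify-write, so no result array is allocated and the result dtype must already equal the lhs dtype, which is why the inplace variants get their own type maps and tables. A hypothetical, simplified signature (dpctl's actual typedef differs in detail):

    #include <sycl/sycl.hpp>
    #include <cstddef>
    #include <vector>

    // Contiguous inplace flavor: lhs_p is both input and output.
    using binary_inplace_contig_fn_sketch_t = sycl::event (*)(
        sycl::queue &exec_q, std::size_t nelems, const char *rhs_p,
        char *lhs_p, const std::vector<sycl::event> &depends);
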
-// U16: ==== IMAG (x)
-namespace impl
-{
-
-namespace imag_fn_ns = dpctl::tensor::kernels::imag;
-
-static unary_contig_impl_fn_ptr_t imag_contig_dispatch_vector[td_ns::num_types];
-static int imag_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    imag_strided_dispatch_vector[td_ns::num_types];
-
-void populate_imag_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = imag_fn_ns;
-
-    using fn_ns::ImagContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ImagContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(imag_contig_dispatch_vector);
-
-    using fn_ns::ImagStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, ImagStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(imag_strided_dispatch_vector);
-
-    using fn_ns::ImagTypeMapFactory;
-    DispatchVectorBuilder<int, ImagTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(imag_output_typeid_vector);
-}
-} // namespace impl
-
-// U17: ==== ISFINITE (x)
-namespace impl
-{
-namespace isfinite_fn_ns = dpctl::tensor::kernels::isfinite;
-
-static unary_contig_impl_fn_ptr_t
-    isfinite_contig_dispatch_vector[td_ns::num_types];
-static int isfinite_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    isfinite_strided_dispatch_vector[td_ns::num_types];
-
-void populate_isfinite_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = isfinite_fn_ns;
-
-    using fn_ns::IsFiniteContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, IsFiniteContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(isfinite_contig_dispatch_vector);
-
-    using fn_ns::IsFiniteStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, IsFiniteStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(isfinite_strided_dispatch_vector);
-
-    using fn_ns::IsFiniteTypeMapFactory;
-    DispatchVectorBuilder<int, IsFiniteTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(isfinite_output_typeid_vector);
-}
-
-} // namespace impl
-
-// U18: ==== ISINF (x)
-namespace impl
-{
-namespace isinf_fn_ns = dpctl::tensor::kernels::isinf;
-
-static unary_contig_impl_fn_ptr_t
-    isinf_contig_dispatch_vector[td_ns::num_types];
-static int isinf_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    isinf_strided_dispatch_vector[td_ns::num_types];
-
-void populate_isinf_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = isinf_fn_ns;
-
-    using fn_ns::IsInfContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, IsInfContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(isinf_contig_dispatch_vector);
-
-    using fn_ns::IsInfStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, IsInfStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(isinf_strided_dispatch_vector);
-
-    using fn_ns::IsInfTypeMapFactory;
-    DispatchVectorBuilder<int, IsInfTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(isinf_output_typeid_vector);
-}
-
-} // namespace impl
-
-// U19: ==== ISNAN (x)
-namespace impl
-{
-namespace isnan_fn_ns = dpctl::tensor::kernels::isnan;
-
-static unary_contig_impl_fn_ptr_t
-    isnan_contig_dispatch_vector[td_ns::num_types];
-static int isnan_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    isnan_strided_dispatch_vector[td_ns::num_types];
-
-void populate_isnan_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = isnan_fn_ns;
-
-    using fn_ns::IsNanContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, IsNanContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(isnan_contig_dispatch_vector);
-
-    using fn_ns::IsNanStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, IsNanStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(isnan_strided_dispatch_vector);
-
-    using fn_ns::IsNanTypeMapFactory;
-    DispatchVectorBuilder<int, IsNanTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(isnan_output_typeid_vector);
-}
-
-} // namespace impl
-// B13: ==== LESS (x1, x2)
-namespace impl
-{
-namespace less_fn_ns = dpctl::tensor::kernels::less;
-
-static binary_contig_impl_fn_ptr_t less_contig_dispatch_table[td_ns::num_types]
-                                                             [td_ns::num_types];
-static int less_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    less_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_less_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = less_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::LessTypeMapFactory;
-    DispatchTableBuilder<int, LessTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(less_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::LessStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LessStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(less_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::LessContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LessContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(less_contig_dispatch_table);
-};
-} // namespace impl
-
-// B14: ==== LESS_EQUAL (x1, x2)
-namespace impl
-{
-namespace less_equal_fn_ns = dpctl::tensor::kernels::less_equal;
-
-static binary_contig_impl_fn_ptr_t
-    less_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int less_equal_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    less_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_less_equal_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = less_equal_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::LessEqualTypeMapFactory;
-    DispatchTableBuilder<int, LessEqualTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(less_equal_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::LessEqualStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LessEqualStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(less_equal_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::LessEqualContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LessEqualContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(less_equal_contig_dispatch_table);
-};
-} // namespace impl
-
-// U20: ==== LOG (x)
-namespace impl
-{
-
-namespace log_fn_ns = dpctl::tensor::kernels::log;
-
-static unary_contig_impl_fn_ptr_t log_contig_dispatch_vector[td_ns::num_types];
-static int log_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    log_strided_dispatch_vector[td_ns::num_types];
-
-void populate_log_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = log_fn_ns;
-
-    using fn_ns::LogContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, LogContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(log_contig_dispatch_vector);
-
-    using fn_ns::LogStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, LogStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(log_strided_dispatch_vector);
-
-    using fn_ns::LogTypeMapFactory;
-    DispatchVectorBuilder<int, LogTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(log_output_typeid_vector);
-}
-
-} // namespace impl
-
-// U21: ==== LOG1P (x)
-namespace impl
-{
-
-namespace log1p_fn_ns = dpctl::tensor::kernels::log1p;
-
-static unary_contig_impl_fn_ptr_t
-    log1p_contig_dispatch_vector[td_ns::num_types];
-static int log1p_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    log1p_strided_dispatch_vector[td_ns::num_types];
-
-void populate_log1p_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = log1p_fn_ns;
-
-    using fn_ns::Log1pContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Log1pContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(log1p_contig_dispatch_vector);
-
-    using fn_ns::Log1pStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Log1pStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(log1p_strided_dispatch_vector);
-
-    using fn_ns::Log1pTypeMapFactory;
-    DispatchVectorBuilder<int, Log1pTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(log1p_output_typeid_vector);
-}
-
-} // namespace impl
-// U22: ==== LOG2 (x)
-namespace impl
-{
-
-namespace log2_fn_ns = dpctl::tensor::kernels::log2;
-
-static unary_contig_impl_fn_ptr_t log2_contig_dispatch_vector[td_ns::num_types];
-static int log2_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    log2_strided_dispatch_vector[td_ns::num_types];
-
-void populate_log2_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = log2_fn_ns;
-
-    using fn_ns::Log2ContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Log2ContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(log2_contig_dispatch_vector);
-
-    using fn_ns::Log2StridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Log2StridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(log2_strided_dispatch_vector);
-
-    using fn_ns::Log2TypeMapFactory;
-    DispatchVectorBuilder<int, Log2TypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(log2_output_typeid_vector);
-};
-
-} // namespace impl
-
-// U23: ==== LOG10 (x)
-namespace impl
-{
-
-namespace log10_fn_ns = dpctl::tensor::kernels::log10;
-
-static unary_contig_impl_fn_ptr_t
-    log10_contig_dispatch_vector[td_ns::num_types];
-static int log10_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    log10_strided_dispatch_vector[td_ns::num_types];
-
-void populate_log10_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = log10_fn_ns;
-
-    using fn_ns::Log10ContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Log10ContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(log10_contig_dispatch_vector);
-
-    using fn_ns::Log10StridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Log10StridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(log10_strided_dispatch_vector);
-
-    using fn_ns::Log10TypeMapFactory;
-    DispatchVectorBuilder<int, Log10TypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(log10_output_typeid_vector);
-};
-
-} // namespace impl
-
-// B15: ==== LOGADDEXP (x1, x2)
-namespace impl
-{
-namespace logaddexp_fn_ns = dpctl::tensor::kernels::logaddexp;
-
-static binary_contig_impl_fn_ptr_t
-    logaddexp_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int logaddexp_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    logaddexp_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_logaddexp_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = logaddexp_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::LogAddExpTypeMapFactory;
-    DispatchTableBuilder<int, LogAddExpTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(logaddexp_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::LogAddExpStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogAddExpStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(logaddexp_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::LogAddExpContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogAddExpContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(logaddexp_contig_dispatch_table);
-};
-} // namespace impl
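
logaddexp computes log(exp(x1) + exp(x2)); evaluated literally, exp() overflows for quite moderate inputs, so implementations conventionally use the rewritten form max(x1, x2) + log1p(exp(-|x1 - x2|)). A scalar sketch of that identity (illustrating the math only, not dpctl's kernel code):

    #include <algorithm>
    #include <cmath>

    // Numerically stable logaddexp: never exponentiates a large argument.
    inline double logaddexp_sketch(double a, double b)
    {
        if (a == b) {
            // handles equal inputs, including a == b == -inf
            return a + std::log(2.0);
        }
        const double mx = std::max(a, b);
        return mx + std::log1p(std::exp(-std::fabs(a - b)));
    }
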
-// B16: ==== LOGICAL_AND (x1, x2)
-namespace impl
-{
-namespace logical_and_fn_ns = dpctl::tensor::kernels::logical_and;
-
-static binary_contig_impl_fn_ptr_t
-    logical_and_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int logical_and_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    logical_and_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_logical_and_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = logical_and_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::LogicalAndTypeMapFactory;
-    DispatchTableBuilder<int, LogicalAndTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(logical_and_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::LogicalAndStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
-                         LogicalAndStridedFactory, num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(logical_and_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::LogicalAndContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogicalAndContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(logical_and_contig_dispatch_table);
-};
-} // namespace impl
-
-// U24: ==== LOGICAL_NOT (x)
-namespace impl
-{
-namespace logical_not_fn_ns = dpctl::tensor::kernels::logical_not;
-
-static unary_contig_impl_fn_ptr_t
-    logical_not_contig_dispatch_vector[td_ns::num_types];
-static int logical_not_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    logical_not_strided_dispatch_vector[td_ns::num_types];
-
-void populate_logical_not_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = logical_not_fn_ns;
-
-    using fn_ns::LogicalNotContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, LogicalNotContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(logical_not_contig_dispatch_vector);
-
-    using fn_ns::LogicalNotStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, LogicalNotStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(logical_not_strided_dispatch_vector);
-
-    using fn_ns::LogicalNotTypeMapFactory;
-    DispatchVectorBuilder<int, LogicalNotTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(logical_not_output_typeid_vector);
-};
-} // namespace impl
-
-// B17: ==== LOGICAL_OR (x1, x2)
-namespace impl
-{
-namespace logical_or_fn_ns = dpctl::tensor::kernels::logical_or;
-
-static binary_contig_impl_fn_ptr_t
-    logical_or_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int logical_or_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    logical_or_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_logical_or_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = logical_or_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::LogicalOrTypeMapFactory;
-    DispatchTableBuilder<int, LogicalOrTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(logical_or_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::LogicalOrStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogicalOrStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(logical_or_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::LogicalOrContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogicalOrContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(logical_or_contig_dispatch_table);
-};
-} // namespace impl
-
-// B18: ==== LOGICAL_XOR (x1, x2)
-namespace impl
-{
-namespace logical_xor_fn_ns = dpctl::tensor::kernels::logical_xor;
-
-static binary_contig_impl_fn_ptr_t
-    logical_xor_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int logical_xor_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    logical_xor_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_logical_xor_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = logical_xor_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::LogicalXorTypeMapFactory;
-    DispatchTableBuilder<int, LogicalXorTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(logical_xor_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::LogicalXorStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
-                         LogicalXorStridedFactory, num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(logical_xor_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::LogicalXorContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogicalXorContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(logical_xor_contig_dispatch_table);
-};
-} // namespace impl
-// B??: ==== MAXIMUM (x1, x2)
-namespace impl
-{
-
-namespace maximum_fn_ns = dpctl::tensor::kernels::maximum;
-
-static binary_contig_impl_fn_ptr_t
-    maximum_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int maximum_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    maximum_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_maximum_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = maximum_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::MaximumTypeMapFactory;
-    DispatchTableBuilder<int, MaximumTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(maximum_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::MaximumStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, MaximumStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(maximum_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::MaximumContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, MaximumContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(maximum_contig_dispatch_table);
-};
-
-} // namespace impl
-
-// B??: ==== MINIMUM (x1, x2)
-namespace impl
-{
-
-namespace minimum_fn_ns = dpctl::tensor::kernels::minimum;
-
-static binary_contig_impl_fn_ptr_t
-    minimum_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int minimum_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    minimum_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_minimum_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = minimum_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::MinimumTypeMapFactory;
-    DispatchTableBuilder<int, MinimumTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(minimum_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::MinimumStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, MinimumStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(minimum_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::MinimumContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, MinimumContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(minimum_contig_dispatch_table);
-};
-
-} // namespace impl
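
The multiply block below registers the same full complement of eight tables as divide: generic contig/strided, both outer-broadcast specializations, and inplace contig/strided/row-matrix variants. The inplace row-matrix case corresponds to scaling every row of a C-contiguous matrix by the same vector; a scalar sketch of the semantics the SYCL kernel parallelizes (illustration only, not dpctl's kernel):

    #include <cstddef>

    // mat is n x m, C-contiguous; row has m elements: mat *= row, rowwise.
    template <typename T>
    void matrix_times_row_inplace_sketch(T *mat, std::size_t n,
                                         std::size_t m, const T *row)
    {
        for (std::size_t i = 0; i < n; ++i) {
            for (std::size_t j = 0; j < m; ++j) {
                mat[i * m + j] *= row[j];
            }
        }
    }
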
-// B19: ==== MULTIPLY (x1, x2)
-namespace impl
-{
-
-namespace multiply_fn_ns = dpctl::tensor::kernels::multiply;
-
-static binary_contig_impl_fn_ptr_t
-    multiply_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int multiply_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    multiply_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-// mul(matrix, row)
-static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
-    multiply_contig_matrix_contig_row_broadcast_dispatch_table
-        [td_ns::num_types][td_ns::num_types];
-
-// mul(row, matrix)
-static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
-    multiply_contig_row_contig_matrix_broadcast_dispatch_table
-        [td_ns::num_types][td_ns::num_types];
-
-static binary_inplace_contig_impl_fn_ptr_t
-    multiply_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static binary_inplace_strided_impl_fn_ptr_t
-    multiply_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
-    multiply_inplace_row_matrix_dispatch_table[td_ns::num_types]
-                                              [td_ns::num_types];
-
-void populate_multiply_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = multiply_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::MultiplyTypeMapFactory;
-    DispatchTableBuilder<int, MultiplyTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(multiply_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::MultiplyStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, MultiplyStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(multiply_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::MultiplyContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, MultiplyContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(multiply_contig_dispatch_table);
-
-    // function pointers for operation on contiguous matrix, contiguous row
-    // with contiguous matrix output
-    using fn_ns::MultiplyContigMatrixContigRowBroadcastFactory;
-    DispatchTableBuilder<
-        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
-        MultiplyContigMatrixContigRowBroadcastFactory, num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(
-        multiply_contig_matrix_contig_row_broadcast_dispatch_table);
-
-    // function pointers for operation on contiguous row, contiguous matrix
-    // with contiguous matrix output
-    using fn_ns::MultiplyContigRowContigMatrixBroadcastFactory;
-    DispatchTableBuilder<
-        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
-        MultiplyContigRowContigMatrixBroadcastFactory, num_types>
-        dtb5;
-    dtb5.populate_dispatch_table(
-        multiply_contig_row_contig_matrix_broadcast_dispatch_table);
-
-    // function pointers for inplace operation on general strided arrays
-    using fn_ns::MultiplyInplaceStridedFactory;
-    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
-                         MultiplyInplaceStridedFactory, num_types>
-        dtb6;
-    dtb6.populate_dispatch_table(multiply_inplace_strided_dispatch_table);
-
-    // function pointers for inplace operation on contiguous inputs and output
-    using fn_ns::MultiplyInplaceContigFactory;
-    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
-                         MultiplyInplaceContigFactory, num_types>
-        dtb7;
-    dtb7.populate_dispatch_table(multiply_inplace_contig_dispatch_table);
-
-    // function pointers for inplace operation on contiguous matrix
-    // and contiguous row
-    using fn_ns::MultiplyInplaceRowMatrixBroadcastFactory;
-    DispatchTableBuilder<binary_inplace_row_matrix_broadcast_impl_fn_ptr_t,
-                         MultiplyInplaceRowMatrixBroadcastFactory, num_types>
-        dtb8;
-    dtb8.populate_dispatch_table(multiply_inplace_row_matrix_dispatch_table);
-};
-
-} // namespace impl
-
-// U25: ==== NEGATIVE (x)
-namespace impl
-{
-
-namespace negative_fn_ns = dpctl::tensor::kernels::negative;
-
-static unary_contig_impl_fn_ptr_t
-    negative_contig_dispatch_vector[td_ns::num_types];
-static int negative_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    negative_strided_dispatch_vector[td_ns::num_types];
-
-void populate_negative_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = negative_fn_ns;
-
-    using fn_ns::NegativeContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, NegativeContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(negative_contig_dispatch_vector);
-
-    using fn_ns::NegativeStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, NegativeStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(negative_strided_dispatch_vector);
-
-    using fn_ns::NegativeTypeMapFactory;
-    DispatchVectorBuilder<int, NegativeTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(negative_output_typeid_vector);
-}
-
-} // namespace impl
-// B20: ==== NOT_EQUAL (x1, x2)
-namespace impl
-{
-namespace not_equal_fn_ns = dpctl::tensor::kernels::not_equal;
-
-static binary_contig_impl_fn_ptr_t
-    not_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int not_equal_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    not_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_not_equal_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = not_equal_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::NotEqualTypeMapFactory;
-    DispatchTableBuilder<int, NotEqualTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(not_equal_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::NotEqualStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, NotEqualStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(not_equal_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::NotEqualContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, NotEqualContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(not_equal_contig_dispatch_table);
-};
-} // namespace impl
-
-// U26: ==== POSITIVE (x)
-namespace impl
-{
-
-namespace positive_fn_ns = dpctl::tensor::kernels::positive;
-
-static unary_contig_impl_fn_ptr_t
-    positive_contig_dispatch_vector[td_ns::num_types];
-static int positive_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    positive_strided_dispatch_vector[td_ns::num_types];
-
-void populate_positive_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = positive_fn_ns;
-
-    using fn_ns::PositiveContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, PositiveContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(positive_contig_dispatch_vector);
-
-    using fn_ns::PositiveStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, PositiveStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(positive_strided_dispatch_vector);
-
-    using fn_ns::PositiveTypeMapFactory;
-    DispatchVectorBuilder<int, PositiveTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(positive_output_typeid_vector);
-}
-
-} // namespace impl
-
-// B21: ==== POW (x1, x2)
-namespace impl
-{
-
-namespace pow_fn_ns = dpctl::tensor::kernels::pow;
-
-static binary_contig_impl_fn_ptr_t pow_contig_dispatch_table[td_ns::num_types]
-                                                            [td_ns::num_types];
-static int pow_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    pow_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_pow_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = pow_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::PowTypeMapFactory;
-    DispatchTableBuilder<int, PowTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(pow_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::PowStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, PowStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(pow_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::PowContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, PowContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(pow_contig_dispatch_table);
-};
-
-} // namespace impl
-// U??: ==== PROJ (x)
-namespace impl
-{
-
-namespace proj_fn_ns = dpctl::tensor::kernels::proj;
-
-static unary_contig_impl_fn_ptr_t proj_contig_dispatch_vector[td_ns::num_types];
-static int proj_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    proj_strided_dispatch_vector[td_ns::num_types];
-
-void populate_proj_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = proj_fn_ns;
-
-    using fn_ns::ProjContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ProjContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(proj_contig_dispatch_vector);
-
-    using fn_ns::ProjStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, ProjStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(proj_strided_dispatch_vector);
-
-    using fn_ns::ProjTypeMapFactory;
-    DispatchVectorBuilder<int, ProjTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(proj_output_typeid_vector);
-}
-} // namespace impl
-
-// U27: ==== REAL (x)
-namespace impl
-{
-
-namespace real_fn_ns = dpctl::tensor::kernels::real;
-
-static unary_contig_impl_fn_ptr_t real_contig_dispatch_vector[td_ns::num_types];
-static int real_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    real_strided_dispatch_vector[td_ns::num_types];
-
-void populate_real_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = real_fn_ns;
-
-    using fn_ns::RealContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, RealContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(real_contig_dispatch_vector);
-
-    using fn_ns::RealStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, RealStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(real_strided_dispatch_vector);
-
-    using fn_ns::RealTypeMapFactory;
-    DispatchVectorBuilder<int, RealTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(real_output_typeid_vector);
-}
-} // namespace impl
-
-// B22: ==== REMAINDER (x1, x2)
-namespace impl
-{
-
-namespace remainder_fn_ns = dpctl::tensor::kernels::remainder;
-
-static binary_contig_impl_fn_ptr_t
-    remainder_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int remainder_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    remainder_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_remainder_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = remainder_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::RemainderTypeMapFactory;
-    DispatchTableBuilder<int, RemainderTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(remainder_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::RemainderStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, RemainderStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(remainder_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::RemainderContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, RemainderContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(remainder_contig_dispatch_table);
-}
-
-} // namespace impl
-
-// U28: ==== ROUND (x)
-namespace impl
-{
-
-namespace round_fn_ns = dpctl::tensor::kernels::round;
-
-static unary_contig_impl_fn_ptr_t
-    round_contig_dispatch_vector[td_ns::num_types];
-static int round_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    round_strided_dispatch_vector[td_ns::num_types];
-
-void populate_round_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = round_fn_ns;
-
-    using fn_ns::RoundContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, RoundContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(round_contig_dispatch_vector);
-
-    using fn_ns::RoundStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, RoundStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(round_strided_dispatch_vector);
-
-    using fn_ns::RoundTypeMapFactory;
-    DispatchVectorBuilder<int, RoundTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(round_output_typeid_vector);
-}
-
-} // namespace impl
-// U29: ==== SIGN (x)
-namespace impl
-{
-
-namespace sign_fn_ns = dpctl::tensor::kernels::sign;
-
-static unary_contig_impl_fn_ptr_t sign_contig_dispatch_vector[td_ns::num_types];
-static int sign_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    sign_strided_dispatch_vector[td_ns::num_types];
-
-void populate_sign_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = sign_fn_ns;
-
-    using fn_ns::SignContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, SignContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(sign_contig_dispatch_vector);
-
-    using fn_ns::SignStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, SignStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(sign_strided_dispatch_vector);
-
-    using fn_ns::SignTypeMapFactory;
-    DispatchVectorBuilder<int, SignTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(sign_output_typeid_vector);
-}
-
-} // namespace impl
-
-// ==== SIGNBIT (x)
-namespace impl
-{
-
-namespace signbit_fn_ns = dpctl::tensor::kernels::signbit;
-
-static unary_contig_impl_fn_ptr_t
-    signbit_contig_dispatch_vector[td_ns::num_types];
-static int signbit_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    signbit_strided_dispatch_vector[td_ns::num_types];
-
-void populate_signbit_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = signbit_fn_ns;
-
-    using fn_ns::SignbitContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, SignbitContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(signbit_contig_dispatch_vector);
-
-    using fn_ns::SignbitStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, SignbitStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(signbit_strided_dispatch_vector);
-
-    using fn_ns::SignbitTypeMapFactory;
-    DispatchVectorBuilder<int, SignbitTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(signbit_output_typeid_vector);
-}
-
-} // namespace impl
-
-// U30: ==== SIN (x)
-namespace impl
-{
-
-namespace sin_fn_ns = dpctl::tensor::kernels::sin;
-
-static unary_contig_impl_fn_ptr_t sin_contig_dispatch_vector[td_ns::num_types];
-static int sin_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    sin_strided_dispatch_vector[td_ns::num_types];
-
-void populate_sin_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = sin_fn_ns;
-
-    using fn_ns::SinContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, SinContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(sin_contig_dispatch_vector);
-
-    using fn_ns::SinStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, SinStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(sin_strided_dispatch_vector);
-
-    using fn_ns::SinTypeMapFactory;
-    DispatchVectorBuilder<int, SinTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(sin_output_typeid_vector);
-}
-
-} // namespace impl
-
-// U31: ==== SINH (x)
-namespace impl
-{
-
-namespace sinh_fn_ns = dpctl::tensor::kernels::sinh;
-
-static unary_contig_impl_fn_ptr_t sinh_contig_dispatch_vector[td_ns::num_types];
-static int sinh_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    sinh_strided_dispatch_vector[td_ns::num_types];
-
-void populate_sinh_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = sinh_fn_ns;
-
-    using fn_ns::SinhContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, SinhContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(sinh_contig_dispatch_vector);
-
-    using fn_ns::SinhStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, SinhStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(sinh_strided_dispatch_vector);
-
-    using fn_ns::SinhTypeMapFactory;
-    DispatchVectorBuilder<int, SinhTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(sinh_output_typeid_vector);
-}
-
-} // namespace impl
-// U32: ==== SQUARE (x)
-namespace impl
-{
-
-namespace square_fn_ns = dpctl::tensor::kernels::square;
-
-static unary_contig_impl_fn_ptr_t
-    square_contig_dispatch_vector[td_ns::num_types];
-static int square_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    square_strided_dispatch_vector[td_ns::num_types];
-
-void populate_square_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = square_fn_ns;
-
-    using fn_ns::SquareContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, SquareContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(square_contig_dispatch_vector);
-
-    using fn_ns::SquareStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, SquareStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(square_strided_dispatch_vector);
-
-    using fn_ns::SquareTypeMapFactory;
-    DispatchVectorBuilder<int, SquareTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(square_output_typeid_vector);
-}
-
-} // namespace impl
-
-// U33: ==== SQRT (x)
-namespace impl
-{
-
-namespace sqrt_fn_ns = dpctl::tensor::kernels::sqrt;
-
-static unary_contig_impl_fn_ptr_t sqrt_contig_dispatch_vector[td_ns::num_types];
-static int sqrt_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    sqrt_strided_dispatch_vector[td_ns::num_types];
-
-void populate_sqrt_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = sqrt_fn_ns;
-
-    using fn_ns::SqrtContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, SqrtContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(sqrt_contig_dispatch_vector);
-
-    using fn_ns::SqrtStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, SqrtStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(sqrt_strided_dispatch_vector);
-
-    using fn_ns::SqrtTypeMapFactory;
-    DispatchVectorBuilder<int, SqrtTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(sqrt_output_typeid_vector);
-}
-
-} // namespace impl
-
-// B23: ==== SUBTRACT (x1, x2)
-namespace impl
-{
-namespace subtract_fn_ns = dpctl::tensor::kernels::subtract;
-
-static binary_contig_impl_fn_ptr_t
-    subtract_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int subtract_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    subtract_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-// sub(matrix, row)
-static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
-    subtract_contig_matrix_contig_row_broadcast_dispatch_table
-        [td_ns::num_types][td_ns::num_types];
-
-// sub(row, matrix)
-static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
-    subtract_contig_row_contig_matrix_broadcast_dispatch_table
-        [td_ns::num_types][td_ns::num_types];
-
-static binary_inplace_contig_impl_fn_ptr_t
-    subtract_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static binary_inplace_strided_impl_fn_ptr_t
-    subtract_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
-    subtract_inplace_row_matrix_dispatch_table[td_ns::num_types]
-                                              [td_ns::num_types];
-
-void populate_subtract_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = subtract_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::SubtractTypeMapFactory;
-    DispatchTableBuilder<int, SubtractTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(subtract_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::SubtractStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, SubtractStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(subtract_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::SubtractContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, SubtractContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(subtract_contig_dispatch_table);
-
-    // function pointers for operation on contiguous matrix, contiguous row
-    // with contiguous matrix output
-    using fn_ns::SubtractContigMatrixContigRowBroadcastFactory;
-    DispatchTableBuilder<
-        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
-        SubtractContigMatrixContigRowBroadcastFactory, num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(
-        subtract_contig_matrix_contig_row_broadcast_dispatch_table);
-
-    // function pointers for operation on contiguous row, contiguous matrix
-    // with contiguous matrix output
-    using fn_ns::SubtractContigRowContigMatrixBroadcastFactory;
-    DispatchTableBuilder<
-        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
-        SubtractContigRowContigMatrixBroadcastFactory, num_types>
-        dtb5;
-    dtb5.populate_dispatch_table(
-        subtract_contig_row_contig_matrix_broadcast_dispatch_table);
-
-    // function pointers for inplace operation on general strided arrays
-    using fn_ns::SubtractInplaceStridedFactory;
-    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
-                         SubtractInplaceStridedFactory, num_types>
-        dtb6;
-    dtb6.populate_dispatch_table(subtract_inplace_strided_dispatch_table);
-
-    // function pointers for inplace operation on contiguous inputs and output
-    using fn_ns::SubtractInplaceContigFactory;
-    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
-                         SubtractInplaceContigFactory, num_types>
-        dtb7;
-    dtb7.populate_dispatch_table(subtract_inplace_contig_dispatch_table);
-
-    // function pointers for inplace operation on contiguous matrix
-    // and contiguous row
-    using fn_ns::SubtractInplaceRowMatrixBroadcastFactory;
-    DispatchTableBuilder<binary_inplace_row_matrix_broadcast_impl_fn_ptr_t,
-                         SubtractInplaceRowMatrixBroadcastFactory, num_types>
-        dtb8;
-    dtb8.populate_dispatch_table(subtract_inplace_row_matrix_dispatch_table);
-};
-
-} // namespace impl
-// U34: ==== TAN (x)
-namespace impl
-{
-
-namespace tan_fn_ns = dpctl::tensor::kernels::tan;
-
-static unary_contig_impl_fn_ptr_t tan_contig_dispatch_vector[td_ns::num_types];
-static int tan_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    tan_strided_dispatch_vector[td_ns::num_types];
-
-void populate_tan_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = tan_fn_ns;
-
-    using fn_ns::TanContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, TanContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(tan_contig_dispatch_vector);
-
-    using fn_ns::TanStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, TanStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(tan_strided_dispatch_vector);
-
-    using fn_ns::TanTypeMapFactory;
-    DispatchVectorBuilder<int, TanTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(tan_output_typeid_vector);
-}
-
-} // namespace impl
-
-// U35: ==== TANH (x)
-namespace impl
-{
-
-namespace tanh_fn_ns = dpctl::tensor::kernels::tanh;
-
-static unary_contig_impl_fn_ptr_t tanh_contig_dispatch_vector[td_ns::num_types];
-static int tanh_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    tanh_strided_dispatch_vector[td_ns::num_types];
-
-void populate_tanh_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = tanh_fn_ns;
-
-    using fn_ns::TanhContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, TanhContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(tanh_contig_dispatch_vector);
-
-    using fn_ns::TanhStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, TanhStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(tanh_strided_dispatch_vector);
-
-    using fn_ns::TanhTypeMapFactory;
-    DispatchVectorBuilder<int, TanhTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(tanh_output_typeid_vector);
-}
-
-} // namespace impl
-// U36: ==== TRUNC (x)
-namespace impl
-{
-
-namespace trunc_fn_ns = dpctl::tensor::kernels::trunc;
-
-static unary_contig_impl_fn_ptr_t
-    trunc_contig_dispatch_vector[td_ns::num_types];
-static int trunc_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    trunc_strided_dispatch_vector[td_ns::num_types];
-
-void populate_trunc_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = trunc_fn_ns;
-
-    using fn_ns::TruncContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, TruncContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(trunc_contig_dispatch_vector);
-
-    using fn_ns::TruncStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, TruncStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(trunc_strided_dispatch_vector);
-
-    using fn_ns::TruncTypeMapFactory;
-    DispatchVectorBuilder<int, TruncTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(trunc_output_typeid_vector);
-}
-
-} // namespace impl
-
-// B24: ==== HYPOT (x1, x2)
-namespace impl
-{
-namespace hypot_fn_ns = dpctl::tensor::kernels::hypot;
-
-static binary_contig_impl_fn_ptr_t
-    hypot_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int hypot_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    hypot_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_hypot_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = hypot_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::HypotTypeMapFactory;
-    DispatchTableBuilder<int, HypotTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(hypot_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::HypotStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, HypotStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(hypot_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::HypotContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, HypotContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(hypot_contig_dispatch_table);
-};
-
-} // namespace impl
-
-// U37: ==== CBRT (x)
-namespace impl
-{
-
-namespace cbrt_fn_ns = dpctl::tensor::kernels::cbrt;
-
-static unary_contig_impl_fn_ptr_t cbrt_contig_dispatch_vector[td_ns::num_types];
-static int cbrt_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    cbrt_strided_dispatch_vector[td_ns::num_types];
-
-void populate_cbrt_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = cbrt_fn_ns;
-
-    using fn_ns::CbrtContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, CbrtContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(cbrt_contig_dispatch_vector);
-
-    using fn_ns::CbrtStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, CbrtStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(cbrt_strided_dispatch_vector);
-
-    using fn_ns::CbrtTypeMapFactory;
-    DispatchVectorBuilder<int, CbrtTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(cbrt_output_typeid_vector);
-}
-
-} // namespace impl
-
-// B25: ==== COPYSIGN (x1, x2)
-namespace impl
-{
-namespace copysign_fn_ns = dpctl::tensor::kernels::copysign;
-
-static binary_contig_impl_fn_ptr_t
-    copysign_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
-static int copysign_output_id_table[td_ns::num_types][td_ns::num_types];
-
-static binary_strided_impl_fn_ptr_t
-    copysign_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
-
-void populate_copysign_dispatch_tables(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = copysign_fn_ns;
-
-    // which input types are supported, and what is the type of the result
-    using fn_ns::CopysignTypeMapFactory;
-    DispatchTableBuilder<int, CopysignTypeMapFactory, num_types> dtb1;
-    dtb1.populate_dispatch_table(copysign_output_id_table);
-
-    // function pointers for operation on general strided arrays
-    using fn_ns::CopysignStridedFactory;
-    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, CopysignStridedFactory,
-                         num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(copysign_strided_dispatch_table);
-
-    // function pointers for operation on contiguous inputs and output
-    using fn_ns::CopysignContigFactory;
-    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, CopysignContigFactory,
-                         num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(copysign_contig_dispatch_table);
-};
-
-} // namespace impl
-
-// U38: ==== EXP2 (x)
-namespace impl
-{
-
-namespace exp2_fn_ns = dpctl::tensor::kernels::exp2;
-
-static unary_contig_impl_fn_ptr_t exp2_contig_dispatch_vector[td_ns::num_types];
-static int exp2_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    exp2_strided_dispatch_vector[td_ns::num_types];
-
-void populate_exp2_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = exp2_fn_ns;
-
-    using fn_ns::Exp2ContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Exp2ContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(exp2_contig_dispatch_vector);
-
-    using fn_ns::Exp2StridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Exp2StridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(exp2_strided_dispatch_vector);
-
-    using fn_ns::Exp2TypeMapFactory;
-    DispatchVectorBuilder<int, Exp2TypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(exp2_output_typeid_vector);
-}
-
-} // namespace impl
-
-// U39: ==== RSQRT (x)
-namespace impl
-{
-
-namespace rsqrt_fn_ns = dpctl::tensor::kernels::rsqrt;
-
-static unary_contig_impl_fn_ptr_t
-    rsqrt_contig_dispatch_vector[td_ns::num_types];
-static int rsqrt_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    rsqrt_strided_dispatch_vector[td_ns::num_types];
-
-void populate_rsqrt_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = rsqrt_fn_ns;
-
-    using fn_ns::RsqrtContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, RsqrtContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(rsqrt_contig_dispatch_vector);
-
-    using fn_ns::RsqrtStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, RsqrtStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(rsqrt_strided_dispatch_vector);
-
-    using fn_ns::RsqrtTypeMapFactory;
-    DispatchVectorBuilder<int, RsqrtTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(rsqrt_output_typeid_vector);
-}
-
-} // namespace impl
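
Once populated, the three per-op structures cooperate in a fixed way inside the Python-facing glue that follows: the type map resolves the result dtype, then the contiguous or strided kernel pointer is launched. A condensed stand-in for what the file's py_unary_ufunc helper does with the tables it is handed (validation and the actual SYCL launch are elided; this is not dpctl's implementation):

    // Returns the destination type id, or -1 for an unsupported input type.
    template <typename ContigFnT, typename StridedFnT>
    int resolve_unary_call(int src_typeid, bool is_contiguous,
                           const int *output_typeid_vector,
                           const ContigFnT *contig_dispatch_vector,
                           const StridedFnT *strided_dispatch_vector)
    {
        const int dst_typeid = output_typeid_vector[src_typeid];
        if (dst_typeid < 0) {
            return -1; // type map marks this input type unsupported
        }
        if (is_contiguous && contig_dispatch_vector[src_typeid]) {
            // launch contig_dispatch_vector[src_typeid](...)
        }
        else {
            // launch strided_dispatch_vector[src_typeid](...)
        }
        return dst_typeid;
    }
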
-static int exp2_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    exp2_strided_dispatch_vector[td_ns::num_types];
-
-void populate_exp2_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = exp2_fn_ns;
-
-    using fn_ns::Exp2ContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Exp2ContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(exp2_contig_dispatch_vector);
-
-    using fn_ns::Exp2StridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Exp2StridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(exp2_strided_dispatch_vector);
-
-    using fn_ns::Exp2TypeMapFactory;
-    DispatchVectorBuilder<int, Exp2TypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(exp2_output_typeid_vector);
-}
-
-} // namespace impl
-
-// U39: ==== RSQRT (x)
-namespace impl
-{
-
-namespace rsqrt_fn_ns = dpctl::tensor::kernels::rsqrt;
-
-static unary_contig_impl_fn_ptr_t
-    rsqrt_contig_dispatch_vector[td_ns::num_types];
-static int rsqrt_output_typeid_vector[td_ns::num_types];
-static unary_strided_impl_fn_ptr_t
-    rsqrt_strided_dispatch_vector[td_ns::num_types];
-
-void populate_rsqrt_dispatch_vectors(void)
-{
-    using namespace td_ns;
-    namespace fn_ns = rsqrt_fn_ns;
-
-    using fn_ns::RsqrtContigFactory;
-    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, RsqrtContigFactory,
-                          num_types>
-        dvb1;
-    dvb1.populate_dispatch_vector(rsqrt_contig_dispatch_vector);
-
-    using fn_ns::RsqrtStridedFactory;
-    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, RsqrtStridedFactory,
-                          num_types>
-        dvb2;
-    dvb2.populate_dispatch_vector(rsqrt_strided_dispatch_vector);
-
-    using fn_ns::RsqrtTypeMapFactory;
-    DispatchVectorBuilder<int, RsqrtTypeMapFactory, num_types> dvb3;
-    dvb3.populate_dispatch_vector(rsqrt_output_typeid_vector);
-}
-
-} // namespace impl
-
-// ==========================================================================================
-//
-
-namespace py = pybind11;
-
-void init_elementwise_functions(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-
-    // U01: ==== ABS (x)
-    {
-        impl::populate_abs_dispatch_vectors();
-        using impl::abs_contig_dispatch_vector;
-        using impl::abs_output_typeid_vector;
-        using impl::abs_strided_dispatch_vector;
-
-        auto abs_pyapi = [&](const arrayT &src, const arrayT &dst,
-                             sycl::queue &exec_q,
-                             const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, abs_output_typeid_vector,
-                abs_contig_dispatch_vector, abs_strided_dispatch_vector);
-        };
-        m.def("_abs", abs_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto abs_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, abs_output_typeid_vector);
-        };
-        m.def("_abs_result_type", abs_result_type_pyapi);
-    }
-
-    // U02: ==== ACOS (x)
-    {
-        impl::populate_acos_dispatch_vectors();
-        using impl::acos_contig_dispatch_vector;
-        using impl::acos_output_typeid_vector;
-        using impl::acos_strided_dispatch_vector;
-
-        auto acos_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, acos_output_typeid_vector,
-                acos_contig_dispatch_vector, acos_strided_dispatch_vector);
-        };
-        m.def("_acos", acos_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto acos_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, acos_output_typeid_vector);
-        };
-        m.def("_acos_result_type", acos_result_type_pyapi);
-    }
-
-    // U03: ===== ACOSH (x)
-    {
-        impl::populate_acosh_dispatch_vectors();
-        using impl::acosh_contig_dispatch_vector;
-        using impl::acosh_output_typeid_vector;
-        using impl::acosh_strided_dispatch_vector;
-
-        auto acosh_pyapi = [&](const arrayT &src, const arrayT &dst,
-                               sycl::queue &exec_q,
-                               const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, acosh_output_typeid_vector,
-                acosh_contig_dispatch_vector, acosh_strided_dispatch_vector);
-        };
-        m.def("_acosh", acosh_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto acosh_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              acosh_output_typeid_vector);
-        };
-        m.def("_acosh_result_type", acosh_result_type_pyapi);
-    }
-
-    // B01: ===== ADD (x1, x2)
-    {
-        impl::populate_add_dispatch_tables();
-        using impl::add_contig_dispatch_table;
-        using impl::add_contig_matrix_contig_row_broadcast_dispatch_table;
-        using impl::add_contig_row_contig_matrix_broadcast_dispatch_table;
-        using impl::add_output_id_table;
-        using impl::add_strided_dispatch_table;
-
-        auto add_pyapi = [&](const dpctl::tensor::usm_ndarray &src1,
-                             const dpctl::tensor::usm_ndarray &src2,
-                             const dpctl::tensor::usm_ndarray &dst,
-                             sycl::queue &exec_q,
-                             const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, add_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                add_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                add_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                add_contig_matrix_contig_row_broadcast_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                add_contig_row_contig_matrix_broadcast_dispatch_table);
-        };
-        auto add_result_type_pyapi = [&](const py::dtype &dtype1,
-                                         const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               add_output_id_table);
-        };
-        m.def("_add", add_pyapi, "", py::arg("src1"), py::arg("src2"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_add_result_type", add_result_type_pyapi, "");
-
-        using impl::add_inplace_contig_dispatch_table;
-        using impl::add_inplace_row_matrix_dispatch_table;
-        using impl::add_inplace_strided_dispatch_table;
-
-        auto add_inplace_pyapi =
-            [&](const dpctl::tensor::usm_ndarray &src,
-                const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q,
-                const std::vector<sycl::event> &depends = {}) {
-                return py_binary_inplace_ufunc(
-                    src, dst, exec_q, depends, add_output_id_table,
-                    // function pointers to handle inplace operation on
-                    // contiguous arrays (pointers may be nullptr)
-                    add_inplace_contig_dispatch_table,
-                    // function pointers to handle inplace operation on strided
-                    // arrays (most general case)
-                    add_inplace_strided_dispatch_table,
-                    // function pointers to handle inplace operation on
-                    // c-contig matrix with c-contig row with broadcasting
-                    // (may be nullptr)
-                    add_inplace_row_matrix_dispatch_table);
-            };
-        m.def("_add_inplace", add_inplace_pyapi, "", py::arg("lhs"),
-              py::arg("rhs"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-    }
-
-    // U04: ===== ASIN (x)
-    {
-        impl::populate_asin_dispatch_vectors();
-        using impl::asin_contig_dispatch_vector;
-        using impl::asin_output_typeid_vector;
-        using impl::asin_strided_dispatch_vector;
-
-        auto asin_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, asin_output_typeid_vector,
-                asin_contig_dispatch_vector, asin_strided_dispatch_vector);
-        };
-        m.def("_asin", asin_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto asin_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, asin_output_typeid_vector);
-        };
-        m.def("_asin_result_type", asin_result_type_pyapi);
-    }
-
-    // U05: ===== ASINH (x)
-    {
-        impl::populate_asinh_dispatch_vectors();
-        using impl::asinh_contig_dispatch_vector;
-        using impl::asinh_output_typeid_vector;
-        using impl::asinh_strided_dispatch_vector;
-
-        auto asinh_pyapi = [&](const arrayT &src, const arrayT &dst,
-                               sycl::queue &exec_q,
-                               const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, asinh_output_typeid_vector,
-                asinh_contig_dispatch_vector, asinh_strided_dispatch_vector);
-        };
-        m.def("_asinh", asinh_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto asinh_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              asinh_output_typeid_vector);
-        };
-        m.def("_asinh_result_type", asinh_result_type_pyapi);
-    }
-
-    // U06: ===== ATAN (x)
-    {
-        impl::populate_atan_dispatch_vectors();
-        using impl::atan_contig_dispatch_vector;
-        using impl::atan_output_typeid_vector;
-        using impl::atan_strided_dispatch_vector;
-
-        auto atan_pyapi = [&](arrayT src, arrayT dst, sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, atan_output_typeid_vector,
-                atan_contig_dispatch_vector, atan_strided_dispatch_vector);
-        };
-        m.def("_atan", atan_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto atan_result_type_pyapi = [&](py::dtype dtype) {
-            return py_unary_ufunc_result_type(dtype, atan_output_typeid_vector);
-        };
-        m.def("_atan_result_type", atan_result_type_pyapi);
-    }
-
-    // B02: ===== ATAN2 (x1, x2)
-    {
-        impl::populate_atan2_dispatch_tables();
-        using impl::atan2_contig_dispatch_table;
-        using impl::atan2_output_id_table;
-        using impl::atan2_strided_dispatch_table;
-
-        auto atan2_pyapi = [&](const dpctl::tensor::usm_ndarray &src1,
-                               const dpctl::tensor::usm_ndarray &src2,
-                               const dpctl::tensor::usm_ndarray &dst,
-                               sycl::queue &exec_q,
-                               const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, atan2_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                atan2_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                atan2_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto atan2_result_type_pyapi = [&](const py::dtype &dtype1,
-                                           const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               atan2_output_id_table);
-        };
-        m.def("_atan2", atan2_pyapi, "", py::arg("src1"), py::arg("src2"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_atan2_result_type", atan2_result_type_pyapi, "");
-    }
-
-    // U07: ===== ATANH (x)
-    {
-        impl::populate_atanh_dispatch_vectors();
-        using impl::atanh_contig_dispatch_vector;
-        using impl::atanh_output_typeid_vector;
-        using impl::atanh_strided_dispatch_vector;
-
-        auto atanh_pyapi = [&](const arrayT &src, const arrayT &dst,
-                               sycl::queue &exec_q,
-                               const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, atanh_output_typeid_vector,
-                atanh_contig_dispatch_vector, atanh_strided_dispatch_vector);
-        };
-        m.def("_atanh", atanh_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto atanh_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              atanh_output_typeid_vector);
-        };
-        m.def("_atanh_result_type", atanh_result_type_pyapi);
-    }
-
-    // B03: ===== BITWISE_AND (x1, x2)
-    {
-        impl::populate_bitwise_and_dispatch_tables();
-        using impl::bitwise_and_contig_dispatch_table;
-        using impl::bitwise_and_output_id_table;
-        using impl::bitwise_and_strided_dispatch_table;
-
-        auto bitwise_and_pyapi = [&](const dpctl::tensor::usm_ndarray &src1,
-                                     const dpctl::tensor::usm_ndarray &src2,
-                                     const dpctl::tensor::usm_ndarray &dst,
-                                     sycl::queue &exec_q,
-                                     const std::vector<sycl::event> &depends =
-                                         {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, bitwise_and_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                bitwise_and_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                bitwise_and_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto bitwise_and_result_type_pyapi = [&](const py::dtype &dtype1,
-                                                 const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               bitwise_and_output_id_table);
-        };
-        m.def("_bitwise_and", bitwise_and_pyapi, "", py::arg("src1"),
-              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_bitwise_and_result_type", bitwise_and_result_type_pyapi, "");
-    }
-
-    // B04: ===== BITWISE_LEFT_SHIFT (x1, x2)
-    {
-        impl::populate_bitwise_left_shift_dispatch_tables();
-        using impl::bitwise_left_shift_contig_dispatch_table;
-        using impl::bitwise_left_shift_output_id_table;
-        using impl::bitwise_left_shift_strided_dispatch_table;
-
-        auto bitwise_left_shift_pyapi = [&](const dpctl::tensor::usm_ndarray
-                                                &src1,
-                                            const dpctl::tensor::usm_ndarray
-                                                &src2,
-                                            const dpctl::tensor::usm_ndarray
-                                                &dst,
-                                            sycl::queue &exec_q,
-                                            const std::vector<sycl::event>
-                                                &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends,
-                bitwise_left_shift_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                bitwise_left_shift_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                bitwise_left_shift_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto bitwise_left_shift_result_type_pyapi =
-            [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-                return py_binary_ufunc_result_type(
-                    dtype1, dtype2, bitwise_left_shift_output_id_table);
-            };
-        m.def("_bitwise_left_shift", bitwise_left_shift_pyapi, "",
-              py::arg("src1"), py::arg("src2"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_bitwise_left_shift_result_type",
-              bitwise_left_shift_result_type_pyapi, "");
-    }
-
-    // U08: ===== BITWISE_INVERT (x)
-    {
-        impl::populate_bitwise_invert_dispatch_vectors();
-        using impl::bitwise_invert_contig_dispatch_vector;
-        using impl::bitwise_invert_output_typeid_vector;
-        using impl::bitwise_invert_strided_dispatch_vector;
-
-        auto bitwise_invert_pyapi = [&](const arrayT &src, const arrayT &dst,
-                                        sycl::queue &exec_q,
-                                        const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends,
-                                  bitwise_invert_output_typeid_vector,
-                                  bitwise_invert_contig_dispatch_vector,
-                                  bitwise_invert_strided_dispatch_vector);
-        };
-        m.def("_bitwise_invert", bitwise_invert_pyapi, "", py::arg("src"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-
-        auto bitwise_invert_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(
-                dtype, bitwise_invert_output_typeid_vector);
-        };
-        m.def("_bitwise_invert_result_type", bitwise_invert_result_type_pyapi);
-    }
-
-    // B05: ===== BITWISE_OR (x1, x2)
-    {
-        impl::populate_bitwise_or_dispatch_tables();
-        using impl::bitwise_or_contig_dispatch_table;
-        using impl::bitwise_or_output_id_table;
-        using impl::bitwise_or_strided_dispatch_table;
-
-        auto bitwise_or_pyapi = [&](const dpctl::tensor::usm_ndarray &src1,
-                                    const dpctl::tensor::usm_ndarray &src2,
-                                    const dpctl::tensor::usm_ndarray &dst,
-                                    sycl::queue &exec_q,
-                                    const std::vector<sycl::event> &depends =
-                                        {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, bitwise_or_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                bitwise_or_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                bitwise_or_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto bitwise_or_result_type_pyapi = [&](const py::dtype &dtype1,
-                                                const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               bitwise_or_output_id_table);
-        };
-        m.def("_bitwise_or", bitwise_or_pyapi, "", py::arg("src1"),
-              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_bitwise_or_result_type", bitwise_or_result_type_pyapi, "");
-    }
-
-    // B06: ===== BITWISE_RIGHT_SHIFT (x1, x2)
-    {
-        impl::populate_bitwise_right_shift_dispatch_tables();
-        using impl::bitwise_right_shift_contig_dispatch_table;
-        using impl::bitwise_right_shift_output_id_table;
-        using impl::bitwise_right_shift_strided_dispatch_table;
-
-        auto bitwise_right_shift_pyapi =
-            [&](const dpctl::tensor::usm_ndarray &src1,
-                const dpctl::tensor::usm_ndarray &src2,
-                const dpctl::tensor::usm_ndarray &dst,
-                sycl::queue &exec_q,
-                const std::vector<sycl::event> &depends = {}) {
-                return py_binary_ufunc(
-                    src1, src2, dst, exec_q, depends,
-                    bitwise_right_shift_output_id_table,
-                    // function pointers to handle operation on contiguous arrays
-                    // (pointers may be nullptr)
-                    bitwise_right_shift_contig_dispatch_table,
-                    // function pointers to handle operation on strided arrays (most
-                    // general case)
-                    bitwise_right_shift_strided_dispatch_table,
-                    // function pointers to handle operation of c-contig matrix and
-                    // c-contig row with broadcasting (may be nullptr)
-                    td_ns::NullPtrTable<
-                        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                    // function pointers to handle operation of c-contig matrix and
-                    // c-contig row with broadcasting (may be nullptr)
-                    td_ns::NullPtrTable<
-                        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-            };
-        auto bitwise_right_shift_result_type_pyapi =
-            [&](const py::dtype &dtype1, const py::dtype &dtype2) {
-                return py_binary_ufunc_result_type(
-                    dtype1, dtype2, bitwise_right_shift_output_id_table);
-            };
-        m.def("_bitwise_right_shift", bitwise_right_shift_pyapi, "",
-              py::arg("src1"), py::arg("src2"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-        m.def("_bitwise_right_shift_result_type",
-              bitwise_right_shift_result_type_pyapi, "");
-    }
-
-    // B07: ===== BITWISE_XOR (x1, x2)
-    {
-        impl::populate_bitwise_xor_dispatch_tables();
-        using impl::bitwise_xor_contig_dispatch_table;
-        using impl::bitwise_xor_output_id_table;
-        using impl::bitwise_xor_strided_dispatch_table;
-
-        auto bitwise_xor_pyapi = [&](const dpctl::tensor::usm_ndarray &src1,
-                                     const dpctl::tensor::usm_ndarray &src2,
-                                     const dpctl::tensor::usm_ndarray &dst,
-                                     sycl::queue &exec_q,
-                                     const std::vector<sycl::event> &depends =
-                                         {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, bitwise_xor_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                bitwise_xor_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                bitwise_xor_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto bitwise_xor_result_type_pyapi = [&](const py::dtype &dtype1,
-                                                 const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               bitwise_xor_output_id_table);
-        };
-        m.def("_bitwise_xor", bitwise_xor_pyapi, "", py::arg("src1"),
-              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_bitwise_xor_result_type", bitwise_xor_result_type_pyapi, "");
-    }
-
-    // U09: ==== CEIL (x)
-    {
-        impl::populate_ceil_dispatch_vectors();
-        using impl::ceil_contig_dispatch_vector;
-        using impl::ceil_output_typeid_vector;
-        using impl::ceil_strided_dispatch_vector;
-
-        auto ceil_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, ceil_output_typeid_vector,
-                ceil_contig_dispatch_vector, ceil_strided_dispatch_vector);
}; - m.def("_ceil", ceil_pyapi, "", py::arg("src"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - - auto ceil_result_type_pyapi = [&](const py::dtype &dtype) { - return py_unary_ufunc_result_type(dtype, ceil_output_typeid_vector); - }; - m.def("_ceil_result_type", ceil_result_type_pyapi); - } - - // U10: ==== CONJ (x) - { - impl::populate_conj_dispatch_vectors(); - using impl::conj_contig_dispatch_vector; - using impl::conj_output_typeid_vector; - using impl::conj_strided_dispatch_vector; - - auto conj_pyapi = [&](const arrayT &src, const arrayT &dst, - sycl::queue &exec_q, - const event_vecT &depends = {}) { - return py_unary_ufunc( - src, dst, exec_q, depends, conj_output_typeid_vector, - conj_contig_dispatch_vector, conj_strided_dispatch_vector); - }; - m.def("_conj", conj_pyapi, "", py::arg("src"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - - auto conj_result_type_pyapi = [&](const py::dtype &dtype) { - return py_unary_ufunc_result_type(dtype, conj_output_typeid_vector); - }; - m.def("_conj_result_type", conj_result_type_pyapi); - } - - // U11: ==== COS (x) - { - impl::populate_cos_dispatch_vectors(); - using impl::cos_contig_dispatch_vector; - using impl::cos_output_typeid_vector; - using impl::cos_strided_dispatch_vector; - - auto cos_pyapi = [&](const arrayT &src, const arrayT &dst, - sycl::queue &exec_q, - const event_vecT &depends = {}) { - return py_unary_ufunc( - src, dst, exec_q, depends, cos_output_typeid_vector, - cos_contig_dispatch_vector, cos_strided_dispatch_vector); - }; - m.def("_cos", cos_pyapi, "", py::arg("src"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - - auto cos_result_type_pyapi = [&](const py::dtype &dtype) { - return py_unary_ufunc_result_type(dtype, cos_output_typeid_vector); - }; - m.def("_cos_result_type", cos_result_type_pyapi); - } - - // U12: ==== COSH (x) - { - impl::populate_cosh_dispatch_vectors(); - using impl::cosh_contig_dispatch_vector; - using impl::cosh_output_typeid_vector; - using impl::cosh_strided_dispatch_vector; - - auto cosh_pyapi = [&](const arrayT &src, const arrayT &dst, - sycl::queue &exec_q, - const event_vecT &depends = {}) { - return py_unary_ufunc( - src, dst, exec_q, depends, cosh_output_typeid_vector, - cosh_contig_dispatch_vector, cosh_strided_dispatch_vector); - }; - m.def("_cosh", cosh_pyapi, "", py::arg("src"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - - auto cosh_result_type_pyapi = [&](const py::dtype &dtype) { - return py_unary_ufunc_result_type(dtype, cosh_output_typeid_vector); - }; - m.def("_cosh_result_type", cosh_result_type_pyapi); - } - - // B08: ==== DIVIDE (x1, x2) - { - impl::populate_true_divide_dispatch_tables(); - using impl::true_divide_contig_dispatch_table; - using impl:: - true_divide_contig_matrix_contig_row_broadcast_dispatch_table; - using impl:: - true_divide_contig_row_contig_matrix_broadcast_dispatch_table; - using impl::true_divide_output_id_table; - using impl::true_divide_strided_dispatch_table; - - auto divide_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, - const dpctl::tensor::usm_ndarray &src2, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends = {}) { - return py_binary_ufunc( - src1, src2, dst, exec_q, depends, true_divide_output_id_table, - // function pointers to handle operation on contiguous arrays - // (pointers may be nullptr) - true_divide_contig_dispatch_table, - // function pointers to handle operation 
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                true_divide_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                true_divide_contig_matrix_contig_row_broadcast_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                true_divide_contig_row_contig_matrix_broadcast_dispatch_table);
-        };
-        auto divide_result_type_pyapi = [&](const py::dtype &dtype1,
-                                            const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               true_divide_output_id_table);
-        };
-        m.def("_divide", divide_pyapi, "", py::arg("src1"), py::arg("src2"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_divide_result_type", divide_result_type_pyapi, "");
-
-        using impl::true_divide_inplace_contig_dispatch_table;
-        using impl::true_divide_inplace_output_id_table;
-        using impl::true_divide_inplace_row_matrix_dispatch_table;
-        using impl::true_divide_inplace_strided_dispatch_table;
-
-        auto divide_inplace_pyapi =
-            [&](const dpctl::tensor::usm_ndarray &src,
-                const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q,
-                const std::vector<sycl::event> &depends = {}) {
-                return py_binary_inplace_ufunc(
-                    src, dst, exec_q, depends,
-                    true_divide_inplace_output_id_table,
-                    // function pointers to handle inplace operation on
-                    // contiguous arrays (pointers may be nullptr)
-                    true_divide_inplace_contig_dispatch_table,
-                    // function pointers to handle inplace operation on strided
-                    // arrays (most general case)
-                    true_divide_inplace_strided_dispatch_table,
-                    // function pointers to handle inplace operation on
-                    // c-contig matrix with c-contig row with broadcasting
-                    // (may be nullptr)
-                    true_divide_inplace_row_matrix_dispatch_table);
-            };
-        m.def("_divide_inplace", divide_inplace_pyapi, "", py::arg("lhs"),
-              py::arg("rhs"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-    }
-
-    // B09: ==== EQUAL (x1, x2)
-    {
-        impl::populate_equal_dispatch_tables();
-        using impl::equal_contig_dispatch_table;
-        using impl::equal_output_id_table;
-        using impl::equal_strided_dispatch_table;
-
-        auto equal_pyapi = [&](const dpctl::tensor::usm_ndarray &src1,
-                               const dpctl::tensor::usm_ndarray &src2,
-                               const dpctl::tensor::usm_ndarray &dst,
-                               sycl::queue &exec_q,
-                               const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, equal_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                equal_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                equal_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto equal_result_type_pyapi = [&](const py::dtype &dtype1,
-                                           const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               equal_output_id_table);
-        };
-        m.def("_equal", equal_pyapi, "", py::arg("src1"), py::arg("src2"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_equal_result_type", equal_result_type_pyapi, "");
-    }
-
-    // U13: ==== EXP (x)
-    {
impl::populate_exp_dispatch_vectors(); - using impl::exp_contig_dispatch_vector; - using impl::exp_output_typeid_vector; - using impl::exp_strided_dispatch_vector; - - auto exp_pyapi = [&](const arrayT &src, const arrayT &dst, - sycl::queue &exec_q, - const event_vecT &depends = {}) { - return py_unary_ufunc( - src, dst, exec_q, depends, exp_output_typeid_vector, - exp_contig_dispatch_vector, exp_strided_dispatch_vector); - }; - m.def("_exp", exp_pyapi, "", py::arg("src"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - - auto exp_result_type_pyapi = [&](const py::dtype &dtype) { - return py_unary_ufunc_result_type(dtype, exp_output_typeid_vector); - }; - m.def("_exp_result_type", exp_result_type_pyapi); - } - - // U14: ==== EXPM1 (x) - { - impl::populate_expm1_dispatch_vectors(); - using impl::expm1_contig_dispatch_vector; - using impl::expm1_output_typeid_vector; - using impl::expm1_strided_dispatch_vector; - - auto expm1_pyapi = [&](const arrayT &src, const arrayT &dst, - sycl::queue &exec_q, - const event_vecT &depends = {}) { - return py_unary_ufunc( - src, dst, exec_q, depends, expm1_output_typeid_vector, - expm1_contig_dispatch_vector, expm1_strided_dispatch_vector); - }; - m.def("_expm1", expm1_pyapi, "", py::arg("src"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - - auto expm1_result_type_pyapi = [&](const py::dtype dtype) { - return py_unary_ufunc_result_type(dtype, - expm1_output_typeid_vector); - }; - m.def("_expm1_result_type", expm1_result_type_pyapi); - } - - // U15: ==== FLOOR (x) - { - impl::populate_floor_dispatch_vectors(); - using impl::floor_contig_dispatch_vector; - using impl::floor_output_typeid_vector; - using impl::floor_strided_dispatch_vector; - - auto floor_pyapi = [&](const arrayT &src, const arrayT &dst, - sycl::queue &exec_q, - const event_vecT &depends = {}) { - return py_unary_ufunc( - src, dst, exec_q, depends, floor_output_typeid_vector, - floor_contig_dispatch_vector, floor_strided_dispatch_vector); - }; - m.def("_floor", floor_pyapi, "", py::arg("src"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - - auto floor_result_type_pyapi = [&](const py::dtype &dtype) { - return py_unary_ufunc_result_type(dtype, - floor_output_typeid_vector); - }; - m.def("_floor_result_type", floor_result_type_pyapi); - } - - // B10: ==== FLOOR_DIVIDE (x1, x2) - { - impl::populate_floor_divide_dispatch_tables(); - using impl::floor_divide_contig_dispatch_table; - using impl::floor_divide_output_id_table; - using impl::floor_divide_strided_dispatch_table; - - auto floor_divide_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, - const dpctl::tensor::usm_ndarray &src2, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends = - {}) { - return py_binary_ufunc( - src1, src2, dst, exec_q, depends, floor_divide_output_id_table, - // function pointers to handle operation on contiguous arrays - // (pointers may be nullptr) - floor_divide_contig_dispatch_table, - // function pointers to handle operation on strided arrays (most - // general case) - floor_divide_strided_dispatch_table, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - td_ns::NullPtrTable< - binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - td_ns::NullPtrTable< - 
binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); - }; - auto floor_divide_result_type_pyapi = [&](const py::dtype &dtype1, - const py::dtype &dtype2) { - return py_binary_ufunc_result_type(dtype1, dtype2, - floor_divide_output_id_table); - }; - m.def("_floor_divide", floor_divide_pyapi, "", py::arg("src1"), - py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), - py::arg("depends") = py::list()); - m.def("_floor_divide_result_type", floor_divide_result_type_pyapi, ""); - - using impl::floor_divide_inplace_contig_dispatch_table; - using impl::floor_divide_inplace_strided_dispatch_table; - - auto floor_divide_inplace_pyapi = - [&](const dpctl::tensor::usm_ndarray &src, - const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, - const std::vector &depends = {}) { - return py_binary_inplace_ufunc( - src, dst, exec_q, depends, floor_divide_output_id_table, - // function pointers to handle inplace operation on - // contiguous arrays (pointers may be nullptr) - floor_divide_inplace_contig_dispatch_table, - // function pointers to handle inplace operation on strided - // arrays (most general case) - floor_divide_inplace_strided_dispatch_table, - // function pointers to handle inplace operation on - // c-contig matrix with c-contig row with broadcasting - // (may be nullptr) - td_ns::NullPtrTable< - binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{}); - }; - m.def("_floor_divide_inplace", floor_divide_inplace_pyapi, "", - py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"), - py::arg("depends") = py::list()); - } - - // B11: ==== GREATER (x1, x2) - { - impl::populate_greater_dispatch_tables(); - using impl::greater_contig_dispatch_table; - using impl::greater_output_id_table; - using impl::greater_strided_dispatch_table; - - auto greater_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, - const dpctl::tensor::usm_ndarray &src2, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends = {}) { - return py_binary_ufunc( - src1, src2, dst, exec_q, depends, greater_output_id_table, - // function pointers to handle operation on contiguous arrays - // (pointers may be nullptr) - greater_contig_dispatch_table, - // function pointers to handle operation on strided arrays (most - // general case) - greater_strided_dispatch_table, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - td_ns::NullPtrTable< - binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - td_ns::NullPtrTable< - binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); - }; - auto greater_result_type_pyapi = [&](const py::dtype &dtype1, - const py::dtype &dtype2) { - return py_binary_ufunc_result_type(dtype1, dtype2, - greater_output_id_table); - }; - m.def("_greater", greater_pyapi, "", py::arg("src1"), py::arg("src2"), - py::arg("dst"), py::arg("sycl_queue"), - py::arg("depends") = py::list()); - m.def("_greater_result_type", greater_result_type_pyapi, ""); - } - - // B12: ==== GREATER_EQUAL (x1, x2) - { - impl::populate_greater_equal_dispatch_tables(); - using impl::greater_equal_contig_dispatch_table; - using impl::greater_equal_output_id_table; - using impl::greater_equal_strided_dispatch_table; - - auto greater_equal_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, - const dpctl::tensor::usm_ndarray &src2, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const 
std::vector &depends = - {}) { - return py_binary_ufunc( - src1, src2, dst, exec_q, depends, greater_equal_output_id_table, - // function pointers to handle operation on contiguous arrays - // (pointers may be nullptr) - greater_equal_contig_dispatch_table, - // function pointers to handle operation on strided arrays (most - // general case) - greater_equal_strided_dispatch_table, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - td_ns::NullPtrTable< - binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - td_ns::NullPtrTable< - binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); - }; - auto greater_equal_result_type_pyapi = [&](const py::dtype &dtype1, - const py::dtype &dtype2) { - return py_binary_ufunc_result_type(dtype1, dtype2, - greater_equal_output_id_table); - }; - m.def("_greater_equal", greater_equal_pyapi, "", py::arg("src1"), - py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), - py::arg("depends") = py::list()); - m.def("_greater_equal_result_type", greater_equal_result_type_pyapi, - ""); - } - - // U16: ==== IMAG (x) - { - impl::populate_imag_dispatch_vectors(); - using impl::imag_contig_dispatch_vector; - using impl::imag_output_typeid_vector; - using impl::imag_strided_dispatch_vector; - - auto imag_pyapi = [&](const arrayT &src, const arrayT &dst, - sycl::queue &exec_q, - const event_vecT &depends = {}) { - return py_unary_ufunc( - src, dst, exec_q, depends, imag_output_typeid_vector, - imag_contig_dispatch_vector, imag_strided_dispatch_vector); - }; - m.def("_imag", imag_pyapi, "", py::arg("src"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - - auto imag_result_type_pyapi = [&](const py::dtype &dtype) { - return py_unary_ufunc_result_type(dtype, imag_output_typeid_vector); - }; - m.def("_imag_result_type", imag_result_type_pyapi); - } - - // U17: ==== ISFINITE (x) - { - impl::populate_isfinite_dispatch_vectors(); - - using impl::isfinite_contig_dispatch_vector; - using impl::isfinite_output_typeid_vector; - using impl::isfinite_strided_dispatch_vector; - auto isfinite_pyapi = - [&](const dpctl::tensor::usm_ndarray &src, - const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q, - const std::vector &depends = {}) { - return py_unary_ufunc(src, dst, exec_q, depends, - isfinite_output_typeid_vector, - isfinite_contig_dispatch_vector, - isfinite_strided_dispatch_vector); - }; - auto isfinite_result_type_pyapi = [&](const py::dtype &dtype) { - return py_unary_ufunc_result_type(dtype, - isfinite_output_typeid_vector); - }; - m.def("_isfinite", isfinite_pyapi, "", py::arg("src"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - m.def("_isfinite_result_type", isfinite_result_type_pyapi, ""); - } - - // U18: ==== ISINF (x) - { - impl::populate_isinf_dispatch_vectors(); - - using impl::isinf_contig_dispatch_vector; - using impl::isinf_output_typeid_vector; - using impl::isinf_strided_dispatch_vector; - auto isinf_pyapi = [&](const dpctl::tensor::usm_ndarray &src, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends = {}) { - return py_unary_ufunc( - src, dst, exec_q, depends, isinf_output_typeid_vector, - isinf_contig_dispatch_vector, isinf_strided_dispatch_vector); - }; - auto isinf_result_type_pyapi = [&](const py::dtype &dtype) { - return 
py_unary_ufunc_result_type(dtype, - isinf_output_typeid_vector); - }; - m.def("_isinf", isinf_pyapi, "", py::arg("src"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - m.def("_isinf_result_type", isinf_result_type_pyapi, ""); - } - - // U19: ==== ISNAN (x) - { - impl::populate_isnan_dispatch_vectors(); - - using impl::isnan_contig_dispatch_vector; - using impl::isnan_output_typeid_vector; - using impl::isnan_strided_dispatch_vector; - auto isnan_pyapi = [&](const dpctl::tensor::usm_ndarray &src, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends = {}) { - return py_unary_ufunc( - src, dst, exec_q, depends, isnan_output_typeid_vector, - isnan_contig_dispatch_vector, isnan_strided_dispatch_vector); - }; - auto isnan_result_type_pyapi = [&](const py::dtype &dtype) { - return py_unary_ufunc_result_type(dtype, - isnan_output_typeid_vector); - }; - m.def("_isnan", isnan_pyapi, "", py::arg("src"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - m.def("_isnan_result_type", isnan_result_type_pyapi, ""); - } - - // B13: ==== LESS (x1, x2) - { - impl::populate_less_dispatch_tables(); - using impl::less_contig_dispatch_table; - using impl::less_output_id_table; - using impl::less_strided_dispatch_table; - - auto less_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, - const dpctl::tensor::usm_ndarray &src2, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends = {}) { - return py_binary_ufunc( - src1, src2, dst, exec_q, depends, less_output_id_table, - // function pointers to handle operation on contiguous arrays - // (pointers may be nullptr) - less_contig_dispatch_table, - // function pointers to handle operation on strided arrays (most - // general case) - less_strided_dispatch_table, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - td_ns::NullPtrTable< - binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - td_ns::NullPtrTable< - binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); - }; - auto less_result_type_pyapi = [&](const py::dtype &dtype1, - const py::dtype &dtype2) { - return py_binary_ufunc_result_type(dtype1, dtype2, - less_output_id_table); - }; - m.def("_less", less_pyapi, "", py::arg("src1"), py::arg("src2"), - py::arg("dst"), py::arg("sycl_queue"), - py::arg("depends") = py::list()); - m.def("_less_result_type", less_result_type_pyapi, ""); - } - - // B14: ==== LESS_EQUAL (x1, x2) - { - impl::populate_less_equal_dispatch_tables(); - using impl::less_equal_contig_dispatch_table; - using impl::less_equal_output_id_table; - using impl::less_equal_strided_dispatch_table; - - auto less_equal_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, - const dpctl::tensor::usm_ndarray &src2, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends = - {}) { - return py_binary_ufunc( - src1, src2, dst, exec_q, depends, less_equal_output_id_table, - // function pointers to handle operation on contiguous arrays - // (pointers may be nullptr) - less_equal_contig_dispatch_table, - // function pointers to handle operation on strided arrays (most - // general case) - less_equal_strided_dispatch_table, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - 
td_ns::NullPtrTable< - binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - td_ns::NullPtrTable< - binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); - }; - auto less_equal_result_type_pyapi = [&](const py::dtype &dtype1, - const py::dtype &dtype2) { - return py_binary_ufunc_result_type(dtype1, dtype2, - less_equal_output_id_table); - }; - m.def("_less_equal", less_equal_pyapi, "", py::arg("src1"), - py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), - py::arg("depends") = py::list()); - m.def("_less_equal_result_type", less_equal_result_type_pyapi, ""); - } - - // U20: ==== LOG (x) - { - impl::populate_log_dispatch_vectors(); - using impl::log_contig_dispatch_vector; - using impl::log_output_typeid_vector; - using impl::log_strided_dispatch_vector; - - auto log_pyapi = [&](const arrayT &src, const arrayT &dst, - sycl::queue &exec_q, - const event_vecT &depends = {}) { - return py_unary_ufunc( - src, dst, exec_q, depends, log_output_typeid_vector, - log_contig_dispatch_vector, log_strided_dispatch_vector); - }; - m.def("_log", log_pyapi, "", py::arg("src"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - - auto log_result_type_pyapi = [&](const py::dtype &dtype) { - return py_unary_ufunc_result_type(dtype, log_output_typeid_vector); - }; - m.def("_log_result_type", log_result_type_pyapi); - } - - // U21: ==== LOG1P (x) - { - impl::populate_log1p_dispatch_vectors(); - using impl::log1p_contig_dispatch_vector; - using impl::log1p_output_typeid_vector; - using impl::log1p_strided_dispatch_vector; - - auto log1p_pyapi = [&](const arrayT &src, const arrayT &dst, - sycl::queue &exec_q, - const event_vecT &depends = {}) { - return py_unary_ufunc( - src, dst, exec_q, depends, log1p_output_typeid_vector, - log1p_contig_dispatch_vector, log1p_strided_dispatch_vector); - }; - m.def("_log1p", log1p_pyapi, "", py::arg("src"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - - auto log1p_result_type_pyapi = [&](const py::dtype &dtype) { - return py_unary_ufunc_result_type(dtype, - log1p_output_typeid_vector); - }; - m.def("_log1p_result_type", log1p_result_type_pyapi); - } - - // U22: ==== LOG2 (x) - { - impl::populate_log2_dispatch_vectors(); - - using impl::log2_contig_dispatch_vector; - using impl::log2_output_typeid_vector; - using impl::log2_strided_dispatch_vector; - auto log2_pyapi = [&](const dpctl::tensor::usm_ndarray &src, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends = {}) { - return py_unary_ufunc( - src, dst, exec_q, depends, log2_output_typeid_vector, - log2_contig_dispatch_vector, log2_strided_dispatch_vector); - }; - auto log2_result_type_pyapi = [&](const py::dtype &dtype) { - return py_unary_ufunc_result_type(dtype, log2_output_typeid_vector); - }; - m.def("_log2", log2_pyapi, "", py::arg("src"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - m.def("_log2_result_type", log2_result_type_pyapi, ""); - } - - // U23: ==== LOG10 (x) - { - impl::populate_log10_dispatch_vectors(); - - using impl::log10_contig_dispatch_vector; - using impl::log10_output_typeid_vector; - using impl::log10_strided_dispatch_vector; - auto log10_pyapi = [&](const dpctl::tensor::usm_ndarray &src, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends = {}) { - return py_unary_ufunc( - src, dst, exec_q, 
depends, log10_output_typeid_vector, - log10_contig_dispatch_vector, log10_strided_dispatch_vector); - }; - auto log10_result_type_pyapi = [&](const py::dtype &dtype) { - return py_unary_ufunc_result_type(dtype, - log10_output_typeid_vector); - }; - m.def("_log10", log10_pyapi, "", py::arg("src"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - m.def("_log10_result_type", log10_result_type_pyapi, ""); - } - - // B15: ==== LOGADDEXP (x1, x2) - { - impl::populate_logaddexp_dispatch_tables(); - using impl::logaddexp_contig_dispatch_table; - using impl::logaddexp_output_id_table; - using impl::logaddexp_strided_dispatch_table; - - auto logaddexp_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, - const dpctl::tensor::usm_ndarray &src2, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends = - {}) { - return py_binary_ufunc( - src1, src2, dst, exec_q, depends, logaddexp_output_id_table, - // function pointers to handle operation on contiguous arrays - // (pointers may be nullptr) - logaddexp_contig_dispatch_table, - // function pointers to handle operation on strided arrays (most - // general case) - logaddexp_strided_dispatch_table, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - td_ns::NullPtrTable< - binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - td_ns::NullPtrTable< - binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); - }; - auto logaddexp_result_type_pyapi = [&](const py::dtype &dtype1, - const py::dtype &dtype2) { - return py_binary_ufunc_result_type(dtype1, dtype2, - logaddexp_output_id_table); - }; - m.def("_logaddexp", logaddexp_pyapi, "", py::arg("src1"), - py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), - py::arg("depends") = py::list()); - m.def("_logaddexp_result_type", logaddexp_result_type_pyapi, ""); - } - - // B16: ==== LOGICAL_AND (x1, x2) - { - impl::populate_logical_and_dispatch_tables(); - using impl::logical_and_contig_dispatch_table; - using impl::logical_and_output_id_table; - using impl::logical_and_strided_dispatch_table; - - auto logical_and_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, - const dpctl::tensor::usm_ndarray &src2, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends = - {}) { - return py_binary_ufunc( - src1, src2, dst, exec_q, depends, logical_and_output_id_table, - // function pointers to handle operation on contiguous arrays - // (pointers may be nullptr) - logical_and_contig_dispatch_table, - // function pointers to handle operation on strided arrays (most - // general case) - logical_and_strided_dispatch_table, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - td_ns::NullPtrTable< - binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - td_ns::NullPtrTable< - binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); - }; - auto logical_and_result_type_pyapi = [&](const py::dtype &dtype1, - const py::dtype &dtype2) { - return py_binary_ufunc_result_type(dtype1, dtype2, - logical_and_output_id_table); - }; - m.def("_logical_and", logical_and_pyapi, "", py::arg("src1"), - py::arg("src2"), py::arg("dst"), 
py::arg("sycl_queue"), - py::arg("depends") = py::list()); - m.def("_logical_and_result_type", logical_and_result_type_pyapi, ""); - } - - // U24: ==== LOGICAL_NOT (x) - { - impl::populate_logical_not_dispatch_vectors(); - using impl::logical_not_contig_dispatch_vector; - using impl::logical_not_output_typeid_vector; - using impl::logical_not_strided_dispatch_vector; - - auto logical_not_pyapi = [&](const arrayT &src, arrayT dst, - sycl::queue &exec_q, - const event_vecT &depends = {}) { - return py_unary_ufunc(src, dst, exec_q, depends, - logical_not_output_typeid_vector, - logical_not_contig_dispatch_vector, - logical_not_strided_dispatch_vector); - }; - m.def("_logical_not", logical_not_pyapi, "", py::arg("src"), - py::arg("dst"), py::arg("sycl_queue"), - py::arg("depends") = py::list()); - - auto logical_not_result_type_pyapi = [&](const py::dtype &dtype) { - return py_unary_ufunc_result_type(dtype, - logical_not_output_typeid_vector); - }; - m.def("_logical_not_result_type", logical_not_result_type_pyapi); - } - - // B17: ==== LOGICAL_OR (x1, x2) - { - impl::populate_logical_or_dispatch_tables(); - using impl::logical_or_contig_dispatch_table; - using impl::logical_or_output_id_table; - using impl::logical_or_strided_dispatch_table; - - auto logical_or_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, - const dpctl::tensor::usm_ndarray &src2, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends = - {}) { - return py_binary_ufunc( - src1, src2, dst, exec_q, depends, logical_or_output_id_table, - // function pointers to handle operation on contiguous arrays - // (pointers may be nullptr) - logical_or_contig_dispatch_table, - // function pointers to handle operation on strided arrays (most - // general case) - logical_or_strided_dispatch_table, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - td_ns::NullPtrTable< - binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be nullptr) - td_ns::NullPtrTable< - binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); - }; - auto logical_or_result_type_pyapi = [&](const py::dtype &dtype1, - const py::dtype &dtype2) { - return py_binary_ufunc_result_type(dtype1, dtype2, - logical_or_output_id_table); - }; - m.def("_logical_or", logical_or_pyapi, "", py::arg("src1"), - py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), - py::arg("depends") = py::list()); - m.def("_logical_or_result_type", logical_or_result_type_pyapi, ""); - } - - // B18: ==== LOGICAL_XOR (x1, x2) - { - impl::populate_logical_xor_dispatch_tables(); - using impl::logical_xor_contig_dispatch_table; - using impl::logical_xor_output_id_table; - using impl::logical_xor_strided_dispatch_table; - - auto logical_xor_pyapi = [&](const dpctl::tensor::usm_ndarray &src1, - const dpctl::tensor::usm_ndarray &src2, - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends = - {}) { - return py_binary_ufunc( - src1, src2, dst, exec_q, depends, logical_xor_output_id_table, - // function pointers to handle operation on contiguous arrays - // (pointers may be nullptr) - logical_xor_contig_dispatch_table, - // function pointers to handle operation on strided arrays (most - // general case) - logical_xor_strided_dispatch_table, - // function pointers to handle operation of c-contig matrix and - // c-contig row with broadcasting (may be 
nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto logical_xor_result_type_pyapi = [&](const py::dtype &dtype1,
-                                                 const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               logical_xor_output_id_table);
-        };
-        m.def("_logical_xor", logical_xor_pyapi, "", py::arg("src1"),
-              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_logical_xor_result_type", logical_xor_result_type_pyapi, "");
-    }
-
-    // B??: ==== MAXIMUM (x1, x2)
-    {
-        impl::populate_maximum_dispatch_tables();
-        using impl::maximum_contig_dispatch_table;
-        using impl::maximum_output_id_table;
-        using impl::maximum_strided_dispatch_table;
-
-        auto maximum_pyapi = [&](const dpctl::tensor::usm_ndarray &src1,
-                                 const dpctl::tensor::usm_ndarray &src2,
-                                 const dpctl::tensor::usm_ndarray &dst,
-                                 sycl::queue &exec_q,
-                                 const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, maximum_output_id_table,
-                // function pointers to handle operation on contiguous
-                // arrays (pointers may be nullptr)
-                maximum_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays
-                // (most general case)
-                maximum_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix
-                // and c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix
-                // and c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto maximum_result_type_pyapi = [&](const py::dtype &dtype1,
-                                             const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               maximum_output_id_table);
-        };
-        m.def("_maximum", maximum_pyapi, "", py::arg("src1"), py::arg("src2"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_maximum_result_type", maximum_result_type_pyapi, "");
-    }
-
-    // B??: ==== MINIMUM (x1, x2)
-    {
-        impl::populate_minimum_dispatch_tables();
-        using impl::minimum_contig_dispatch_table;
-        using impl::minimum_output_id_table;
-        using impl::minimum_strided_dispatch_table;
-
-        auto minimum_pyapi = [&](const dpctl::tensor::usm_ndarray &src1,
-                                 const dpctl::tensor::usm_ndarray &src2,
-                                 const dpctl::tensor::usm_ndarray &dst,
-                                 sycl::queue &exec_q,
-                                 const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, minimum_output_id_table,
-                // function pointers to handle operation on contiguous
-                // arrays (pointers may be nullptr)
-                minimum_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays
-                // (most general case)
-                minimum_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix
-                // and c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto minimum_result_type_pyapi = [&](const py::dtype &dtype1,
-                                             const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               minimum_output_id_table);
-        };
-        m.def("_minimum", minimum_pyapi, "", py::arg("src1"), py::arg("src2"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_minimum_result_type", minimum_result_type_pyapi, "");
-    }
-
-    // B19: ==== MULTIPLY (x1, x2)
-    {
-        impl::populate_multiply_dispatch_tables();
-        using impl::multiply_contig_dispatch_table;
-        using impl::multiply_contig_matrix_contig_row_broadcast_dispatch_table;
-        using impl::multiply_contig_row_contig_matrix_broadcast_dispatch_table;
-        using impl::multiply_output_id_table;
-        using impl::multiply_strided_dispatch_table;
-
-        auto multiply_pyapi =
-            [&](const dpctl::tensor::usm_ndarray &src1,
-                const dpctl::tensor::usm_ndarray &src2,
-                const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q,
-                const std::vector<sycl::event> &depends = {}) {
-                return py_binary_ufunc(
-                    src1, src2, dst, exec_q, depends, multiply_output_id_table,
-                    // function pointers to handle operation on contiguous
-                    // arrays (pointers may be nullptr)
-                    multiply_contig_dispatch_table,
-                    // function pointers to handle operation on strided arrays
-                    // (most general case)
-                    multiply_strided_dispatch_table,
-                    // function pointers to handle operation of c-contig matrix
-                    // and c-contig row with broadcasting (may be nullptr)
-                    multiply_contig_matrix_contig_row_broadcast_dispatch_table,
-                    // function pointers to handle operation of c-contig matrix
-                    // and c-contig row with broadcasting (may be nullptr)
-                    multiply_contig_row_contig_matrix_broadcast_dispatch_table);
-            };
-        auto multiply_result_type_pyapi = [&](const py::dtype &dtype1,
-                                              const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               multiply_output_id_table);
-        };
-        m.def("_multiply", multiply_pyapi, "", py::arg("src1"), py::arg("src2"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_multiply_result_type", multiply_result_type_pyapi, "");
-
-        using impl::multiply_inplace_contig_dispatch_table;
-        using impl::multiply_inplace_row_matrix_dispatch_table;
-        using impl::multiply_inplace_strided_dispatch_table;
-
-        auto multiply_inplace_pyapi =
-            [&](const dpctl::tensor::usm_ndarray &src,
-                const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q,
-                const std::vector<sycl::event> &depends = {}) {
-                return py_binary_inplace_ufunc(
-                    src, dst, exec_q, depends, multiply_output_id_table,
-                    // function pointers to handle inplace operation on
-                    // contiguous arrays (pointers may be nullptr)
-                    multiply_inplace_contig_dispatch_table,
-                    // function pointers to handle inplace operation on strided
-                    // arrays (most general case)
-                    multiply_inplace_strided_dispatch_table,
-                    // function pointers to handle inplace operation on
-                    // c-contig matrix with c-contig row with broadcasting
-                    // (may be nullptr)
-                    multiply_inplace_row_matrix_dispatch_table);
-            };
-        m.def("_multiply_inplace", multiply_inplace_pyapi, "", py::arg("lhs"),
-              py::arg("rhs"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-    }
-
-    // U25: ==== NEGATIVE (x)
-    {
-        impl::populate_negative_dispatch_vectors();
-        using impl::negative_contig_dispatch_vector;
-        using impl::negative_output_typeid_vector;
-        using impl::negative_strided_dispatch_vector;
-
-        auto negative_pyapi = [&](const arrayT &src, const arrayT &dst,
-                                  sycl::queue &exec_q,
-                                  const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends,
-                                  negative_output_typeid_vector,
-                                  negative_contig_dispatch_vector,
-                                  negative_strided_dispatch_vector);
-        };
-        m.def("_negative", negative_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto negative_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              negative_output_typeid_vector);
-        };
-        m.def("_negative_result_type", negative_result_type_pyapi);
-    }
-
-    // B20: ==== NOT_EQUAL (x1, x2)
-    {
-        impl::populate_not_equal_dispatch_tables();
-        using impl::not_equal_contig_dispatch_table;
-        using impl::not_equal_output_id_table;
-        using impl::not_equal_strided_dispatch_table;
-
-        auto not_equal_pyapi = [&](const dpctl::tensor::usm_ndarray &src1,
-                                   const dpctl::tensor::usm_ndarray &src2,
-                                   const dpctl::tensor::usm_ndarray &dst,
-                                   sycl::queue &exec_q,
-                                   const std::vector<sycl::event> &depends =
-                                       {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, not_equal_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                not_equal_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                not_equal_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto not_equal_result_type_pyapi = [&](const py::dtype &dtype1,
-                                               const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               not_equal_output_id_table);
-        };
-        m.def("_not_equal", not_equal_pyapi, "", py::arg("src1"),
-              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_not_equal_result_type", not_equal_result_type_pyapi, "");
-    }
-
-    // U26: ==== POSITIVE (x)
-    {
-        impl::populate_positive_dispatch_vectors();
-        using impl::positive_contig_dispatch_vector;
-        using impl::positive_output_typeid_vector;
-        using impl::positive_strided_dispatch_vector;
-
-        auto positive_pyapi = [&](const arrayT &src, const arrayT &dst,
-                                  sycl::queue &exec_q,
-                                  const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends,
-                                  positive_output_typeid_vector,
-                                  positive_contig_dispatch_vector,
-                                  positive_strided_dispatch_vector);
-        };
-        m.def("_positive", positive_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto positive_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              positive_output_typeid_vector);
-        };
-        m.def("_positive_result_type", positive_result_type_pyapi);
-    }
-
-    // B21: ==== POW (x1, x2)
-    {
-        impl::populate_pow_dispatch_tables();
-        using impl::pow_contig_dispatch_table;
-        using impl::pow_output_id_table;
-        using impl::pow_strided_dispatch_table;
-
-        auto pow_pyapi = [&](const dpctl::tensor::usm_ndarray &src1,
-                             const dpctl::tensor::usm_ndarray &src2,
-                             const dpctl::tensor::usm_ndarray &dst,
-                             sycl::queue &exec_q,
-                             const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, pow_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                pow_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                pow_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto pow_result_type_pyapi = [&](const py::dtype &dtype1,
-                                         const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               pow_output_id_table);
-        };
-        m.def("_pow", pow_pyapi, "", py::arg("src1"), py::arg("src2"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_pow_result_type", pow_result_type_pyapi, "");
-    }
-
-    // U??: ==== PROJ (x)
-    {
-        impl::populate_proj_dispatch_vectors();
-        using impl::proj_contig_dispatch_vector;
-        using impl::proj_output_typeid_vector;
-        using impl::proj_strided_dispatch_vector;
-
-        auto proj_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, proj_output_typeid_vector,
-                proj_contig_dispatch_vector, proj_strided_dispatch_vector);
-        };
-        m.def("_proj", proj_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto proj_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, proj_output_typeid_vector);
-        };
-        m.def("_proj_result_type", proj_result_type_pyapi);
-    }
-
-    // U27: ==== REAL (x)
-    {
-        impl::populate_real_dispatch_vectors();
-        using impl::real_contig_dispatch_vector;
-        using impl::real_output_typeid_vector;
-        using impl::real_strided_dispatch_vector;
-
-        auto real_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, real_output_typeid_vector,
-                real_contig_dispatch_vector, real_strided_dispatch_vector);
-        };
-        m.def("_real", real_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto real_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, real_output_typeid_vector);
-        };
-        m.def("_real_result_type", real_result_type_pyapi);
-    }
-
-    // B22: ==== REMAINDER (x1, x2)
-    {
-        impl::populate_remainder_dispatch_tables();
-        using impl::remainder_contig_dispatch_table;
-        using impl::remainder_output_id_table;
-        using impl::remainder_strided_dispatch_table;
-
-        auto remainder_pyapi = [&](const dpctl::tensor::usm_ndarray &src1,
-                                   const dpctl::tensor::usm_ndarray &src2,
-                                   const dpctl::tensor::usm_ndarray &dst,
-                                   sycl::queue &exec_q,
-                                   const std::vector<sycl::event> &depends =
-                                       {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, remainder_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                remainder_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                remainder_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto remainder_result_type_pyapi = [&](const py::dtype &dtype1,
-                                               const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               remainder_output_id_table);
-        };
-        m.def("_remainder", remainder_pyapi, "", py::arg("src1"),
-              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_remainder_result_type", remainder_result_type_pyapi, "");
-    }
-
-    // U28: ==== ROUND (x)
-    {
-        impl::populate_round_dispatch_vectors();
-        using impl::round_contig_dispatch_vector;
-        using impl::round_output_typeid_vector;
-        using impl::round_strided_dispatch_vector;
-
-        auto round_pyapi = [&](const arrayT &src, const arrayT &dst,
-                               sycl::queue &exec_q,
-                               const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, round_output_typeid_vector,
-                round_contig_dispatch_vector, round_strided_dispatch_vector);
-        };
-        m.def("_round", round_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto round_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              round_output_typeid_vector);
-        };
-        m.def("_round_result_type", round_result_type_pyapi);
-    }
-
-    // U29: ==== SIGN (x)
-    {
-        impl::populate_sign_dispatch_vectors();
-        using impl::sign_contig_dispatch_vector;
-        using impl::sign_output_typeid_vector;
-        using impl::sign_strided_dispatch_vector;
-
-        auto sign_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, sign_output_typeid_vector,
-                sign_contig_dispatch_vector, sign_strided_dispatch_vector);
-        };
-        m.def("_sign", sign_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto sign_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, sign_output_typeid_vector);
-        };
-        m.def("_sign_result_type", sign_result_type_pyapi);
-    }
-
-    // ==== SIGNBIT (x)
-    {
-        impl::populate_signbit_dispatch_vectors();
-        using impl::signbit_contig_dispatch_vector;
-        using impl::signbit_output_typeid_vector;
-        using impl::signbit_strided_dispatch_vector;
-
-        auto signbit_pyapi = [&](const arrayT &src, const arrayT &dst,
-                                 sycl::queue &exec_q,
-                                 const event_vecT &depends = {}) {
-            return py_unary_ufunc(src, dst, exec_q, depends,
-                                  signbit_output_typeid_vector,
-                                  signbit_contig_dispatch_vector,
-                                  signbit_strided_dispatch_vector);
-        };
-        m.def("_signbit", signbit_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto signbit_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              signbit_output_typeid_vector);
-        };
-        m.def("_signbit_result_type", signbit_result_type_pyapi);
-    }
-
-    // U30: ==== SIN (x)
-    {
-        impl::populate_sin_dispatch_vectors();
-        using impl::sin_contig_dispatch_vector;
-        using impl::sin_output_typeid_vector;
-        using impl::sin_strided_dispatch_vector;
-
-        auto sin_pyapi = [&](const arrayT &src, const arrayT &dst,
-                             sycl::queue &exec_q,
-                             const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, sin_output_typeid_vector,
-                sin_contig_dispatch_vector, sin_strided_dispatch_vector);
-        };
-        m.def("_sin", sin_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto sin_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, sin_output_typeid_vector);
-        };
-        m.def("_sin_result_type", sin_result_type_pyapi);
-    }
-    // U31: ==== SINH (x)
-    {
-        impl::populate_sinh_dispatch_vectors();
-        using impl::sinh_contig_dispatch_vector;
-        using impl::sinh_output_typeid_vector;
-        using impl::sinh_strided_dispatch_vector;
-
-        auto sinh_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, sinh_output_typeid_vector,
-                sinh_contig_dispatch_vector, sinh_strided_dispatch_vector);
-        };
-        m.def("_sinh", sinh_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto sinh_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, sinh_output_typeid_vector);
-        };
-        m.def("_sinh_result_type", sinh_result_type_pyapi);
-    }
-
-    // U32: ==== SQUARE (x)
-    {
-        impl::populate_square_dispatch_vectors();
-        using impl::square_contig_dispatch_vector;
-        using impl::square_output_typeid_vector;
-        using impl::square_strided_dispatch_vector;
-
-        auto square_pyapi = [&](const arrayT &src, const arrayT &dst,
-                                sycl::queue &exec_q,
-                                const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, square_output_typeid_vector,
-                square_contig_dispatch_vector, square_strided_dispatch_vector);
-        };
-        m.def("_square", square_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto square_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              square_output_typeid_vector);
-        };
-        m.def("_square_result_type", square_result_type_pyapi);
-    }
-
-    // U33: ==== SQRT (x)
-    {
-        impl::populate_sqrt_dispatch_vectors();
-        using impl::sqrt_contig_dispatch_vector;
-        using impl::sqrt_output_typeid_vector;
-        using impl::sqrt_strided_dispatch_vector;
-
-        auto sqrt_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, sqrt_output_typeid_vector,
-                sqrt_contig_dispatch_vector, sqrt_strided_dispatch_vector);
-        };
-        m.def("_sqrt", sqrt_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto sqrt_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, sqrt_output_typeid_vector);
-        };
-        m.def("_sqrt_result_type", sqrt_result_type_pyapi);
-    }
-
-    // B23: ==== SUBTRACT (x1, x2)
-    {
-        impl::populate_subtract_dispatch_tables();
-        using impl::subtract_contig_dispatch_table;
-        using impl::subtract_contig_matrix_contig_row_broadcast_dispatch_table;
-        using impl::subtract_contig_row_contig_matrix_broadcast_dispatch_table;
-        using impl::subtract_output_id_table;
-        using impl::subtract_strided_dispatch_table;
-
-        auto subtract_pyapi =
-            [&](const dpctl::tensor::usm_ndarray &src1,
-                const dpctl::tensor::usm_ndarray &src2,
-                const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q,
-                const std::vector<sycl::event> &depends = {}) {
-                return py_binary_ufunc(
-                    src1, src2, dst, exec_q, depends, subtract_output_id_table,
-                    // function pointers to handle operation on contiguous
-                    // arrays (pointers may be nullptr)
-                    subtract_contig_dispatch_table,
-                    // function pointers to handle operation on strided arrays
-                    // (most general case)
-                    subtract_strided_dispatch_table,
-                    // function pointers to handle operation of c-contig matrix
-                    // and c-contig row with broadcasting (may be nullptr)
-                    subtract_contig_matrix_contig_row_broadcast_dispatch_table,
-                    // function pointers to handle operation of c-contig matrix
-                    // and c-contig row with broadcasting (may be nullptr)
-                    subtract_contig_row_contig_matrix_broadcast_dispatch_table);
-            };
-        auto subtract_result_type_pyapi = [&](const py::dtype &dtype1,
-                                              const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               subtract_output_id_table);
-        };
-        m.def("_subtract", subtract_pyapi, "", py::arg("src1"), py::arg("src2"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_subtract_result_type", subtract_result_type_pyapi, "");
-
-        using impl::subtract_inplace_contig_dispatch_table;
-        using impl::subtract_inplace_row_matrix_dispatch_table;
-        using impl::subtract_inplace_strided_dispatch_table;
-
-        auto subtract_inplace_pyapi =
-            [&](const dpctl::tensor::usm_ndarray &src,
-                const dpctl::tensor::usm_ndarray &dst, sycl::queue &exec_q,
-                const std::vector<sycl::event> &depends = {}) {
-                return py_binary_inplace_ufunc(
-                    src, dst, exec_q, depends, subtract_output_id_table,
-                    // function pointers to handle inplace operation on
-                    // contiguous arrays (pointers may be nullptr)
-                    subtract_inplace_contig_dispatch_table,
-                    // function pointers to handle inplace operation on strided
-                    // arrays (most general case)
-                    subtract_inplace_strided_dispatch_table,
-                    // function pointers to handle inplace operation on
-                    // c-contig matrix with c-contig row with broadcasting
-                    // (may be nullptr)
-                    subtract_inplace_row_matrix_dispatch_table);
-            };
-        m.def("_subtract_inplace", subtract_inplace_pyapi, "", py::arg("lhs"),
-              py::arg("rhs"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-    }
-
-    // U34: ==== TAN (x)
-    {
-        impl::populate_tan_dispatch_vectors();
-        using impl::tan_contig_dispatch_vector;
-        using impl::tan_output_typeid_vector;
-        using impl::tan_strided_dispatch_vector;
-
-        auto tan_pyapi = [&](const arrayT &src, const arrayT &dst,
-                             sycl::queue &exec_q,
-                             const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, tan_output_typeid_vector,
-                tan_contig_dispatch_vector, tan_strided_dispatch_vector);
-        };
-        m.def("_tan", tan_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto tan_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, tan_output_typeid_vector);
-        };
-        m.def("_tan_result_type", tan_result_type_pyapi);
-    }
-
-    // U35: ==== TANH (x)
-    {
-        impl::populate_tanh_dispatch_vectors();
-        using impl::tanh_contig_dispatch_vector;
-        using impl::tanh_output_typeid_vector;
-        using impl::tanh_strided_dispatch_vector;
-
-        auto tanh_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, tanh_output_typeid_vector,
-                tanh_contig_dispatch_vector, tanh_strided_dispatch_vector);
-        };
-        m.def("_tanh", tanh_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto tanh_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, tanh_output_typeid_vector);
-        };
-        m.def("_tanh_result_type", tanh_result_type_pyapi);
-    }
-
-    // U36: ==== TRUNC (x)
-    {
-        impl::populate_trunc_dispatch_vectors();
-        using impl::trunc_contig_dispatch_vector;
-        using impl::trunc_output_typeid_vector;
-        using impl::trunc_strided_dispatch_vector;
-
-        auto trunc_pyapi = [&](const arrayT &src, const arrayT &dst,
-                               sycl::queue &exec_q,
-                               const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, trunc_output_typeid_vector,
-                trunc_contig_dispatch_vector, trunc_strided_dispatch_vector);
-        };
-        m.def("_trunc", trunc_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto trunc_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              trunc_output_typeid_vector);
-        };
-        m.def("_trunc_result_type", trunc_result_type_pyapi);
-    }
-
-    // B24: ==== HYPOT (x1, x2)
-    {
-        impl::populate_hypot_dispatch_tables();
-        using impl::hypot_contig_dispatch_table;
-        using impl::hypot_output_id_table;
-        using impl::hypot_strided_dispatch_table;
-
-        auto hypot_pyapi = [&](const dpctl::tensor::usm_ndarray &src1,
-                               const dpctl::tensor::usm_ndarray &src2,
-                               const dpctl::tensor::usm_ndarray &dst,
-                               sycl::queue &exec_q,
-                               const std::vector<sycl::event> &depends = {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, hypot_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                hypot_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                hypot_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto hypot_result_type_pyapi = [&](const py::dtype &dtype1,
-                                           const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               hypot_output_id_table);
-        };
-        m.def("_hypot", hypot_pyapi, "", py::arg("src1"), py::arg("src2"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_hypot_result_type", hypot_result_type_pyapi, "");
-    }
-
-    // U37: ==== CBRT (x)
-    {
-        impl::populate_cbrt_dispatch_vectors();
-        using impl::cbrt_contig_dispatch_vector;
-        using impl::cbrt_output_typeid_vector;
-        using impl::cbrt_strided_dispatch_vector;
-
-        auto cbrt_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, cbrt_output_typeid_vector,
-                cbrt_contig_dispatch_vector, cbrt_strided_dispatch_vector);
-        };
-        m.def("_cbrt", cbrt_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto cbrt_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, cbrt_output_typeid_vector);
-        };
-        m.def("_cbrt_result_type", cbrt_result_type_pyapi);
-    }
-
-    // B25: ==== COPYSIGN (x1, x2)
-    {
-        impl::populate_copysign_dispatch_tables();
-        using impl::copysign_contig_dispatch_table;
-        using impl::copysign_output_id_table;
-        using impl::copysign_strided_dispatch_table;
-
-        auto copysign_pyapi = [&](const dpctl::tensor::usm_ndarray &src1,
-                                  const dpctl::tensor::usm_ndarray &src2,
-                                  const dpctl::tensor::usm_ndarray &dst,
-                                  sycl::queue &exec_q,
-                                  const std::vector<sycl::event> &depends =
-                                      {}) {
-            return py_binary_ufunc(
-                src1, src2, dst, exec_q, depends, copysign_output_id_table,
-                // function pointers to handle operation on contiguous arrays
-                // (pointers may be nullptr)
-                copysign_contig_dispatch_table,
-                // function pointers to handle operation on strided arrays (most
-                // general case)
-                copysign_strided_dispatch_table,
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
-                // function pointers to handle operation of c-contig matrix and
-                // c-contig row with broadcasting (may be nullptr)
-                td_ns::NullPtrTable<
-                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
-        };
-        auto copysign_result_type_pyapi = [&](const py::dtype &dtype1,
-                                              const py::dtype &dtype2) {
-            return py_binary_ufunc_result_type(dtype1, dtype2,
-                                               copysign_output_id_table);
-        };
-        m.def("_copysign", copysign_pyapi, "", py::arg("src1"), py::arg("src2"),
-              py::arg("dst"), py::arg("sycl_queue"),
-              py::arg("depends") = py::list());
-        m.def("_copysign_result_type", copysign_result_type_pyapi, "");
-    }
-
-    // U38: ==== EXP2 (x)
-    {
-        impl::populate_exp2_dispatch_vectors();
-        using impl::exp2_contig_dispatch_vector;
-        using impl::exp2_output_typeid_vector;
-        using impl::exp2_strided_dispatch_vector;
-
-        auto exp2_pyapi = [&](const arrayT &src, const arrayT &dst,
-                              sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, exp2_output_typeid_vector,
-                exp2_contig_dispatch_vector, exp2_strided_dispatch_vector);
-        };
-        m.def("_exp2", exp2_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto exp2_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype, exp2_output_typeid_vector);
-        };
-        m.def("_exp2_result_type", exp2_result_type_pyapi);
-    }
-
-    // U39: ==== RSQRT (x)
-    {
-        impl::populate_rsqrt_dispatch_vectors();
-        using impl::rsqrt_contig_dispatch_vector;
-        using impl::rsqrt_output_typeid_vector;
-        using impl::rsqrt_strided_dispatch_vector;
-
-        auto rsqrt_pyapi = [&](const arrayT &src, const arrayT &dst,
-                               sycl::queue &exec_q,
-                               const event_vecT &depends = {}) {
-            return py_unary_ufunc(
-                src, dst, exec_q, depends, rsqrt_output_typeid_vector,
-                rsqrt_contig_dispatch_vector, rsqrt_strided_dispatch_vector);
-        };
-        m.def("_rsqrt", rsqrt_pyapi, "", py::arg("src"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto rsqrt_result_type_pyapi = [&](const py::dtype &dtype) {
-            return py_unary_ufunc_result_type(dtype,
-                                              rsqrt_output_typeid_vector);
-        };
-        m.def("_rsqrt_result_type", rsqrt_result_type_pyapi);
-    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
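Every registration block removed above follows one recipe: populate the dispatch structures, wrap py_unary_ufunc or py_binary_ufunc in a small lambda, and expose the lambda plus a result-type query through m.def. The per-function translation units added below keep that recipe; what is then needed is one place that calls each init_* entry point when the extension module loads. A minimal sketch of such an aggregator follows; the function name init_elementwise_functions and the file layout are assumptions for illustration, since the actual wiring lands elsewhere in this patch series.

    // Hypothetical aggregator sketch (not part of this patch): one call
    // per elementwise-function translation unit added below.
    #include <pybind11/pybind11.h>

    namespace py = pybind11;

    namespace dpctl::tensor::py_internal
    {
    extern void init_abs(py::module_ m);
    extern void init_acos(py::module_ m);
    // ... one extern declaration per elementwise function ...

    void init_elementwise_functions(py::module_ m) // assumed name
    {
        init_abs(m);
        init_acos(m);
        // ... remaining init_* calls, one per translation unit
    }
    } // namespace dpctl::tensor::py_internal
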
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/abs.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/abs.cpp
new file mode 100644
index 0000000000..4b3e8b635b
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/abs.cpp
@@ -0,0 +1,119 @@
+//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
+#include <vector>
+
+#include "abs.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/abs.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U01: ==== ABS (x)
+namespace impl
+{
+
+namespace abs_fn_ns = dpctl::tensor::kernels::abs;
+
+static unary_contig_impl_fn_ptr_t abs_contig_dispatch_vector[td_ns::num_types];
+static int abs_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    abs_strided_dispatch_vector[td_ns::num_types];
+
+void populate_abs_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = abs_fn_ns;
+
+    using fn_ns::AbsContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AbsContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(abs_contig_dispatch_vector);
+
+    using fn_ns::AbsStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AbsStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(abs_strided_dispatch_vector);
+
+    using fn_ns::AbsTypeMapFactory;
+    DispatchVectorBuilder<int, AbsTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(abs_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_abs(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_abs_dispatch_vectors();
+        using impl::abs_contig_dispatch_vector;
+        using impl::abs_output_typeid_vector;
+        using impl::abs_strided_dispatch_vector;
+
+        auto abs_pyapi = [&](const arrayT &src, const arrayT &dst,
+                             sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, abs_output_typeid_vector,
+                abs_contig_dispatch_vector, abs_strided_dispatch_vector);
+        };
+        m.def("_abs", abs_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto abs_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, abs_output_typeid_vector);
+        };
+        m.def("_abs_result_type", abs_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
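The populate_abs_dispatch_vectors routine above leans on DispatchVectorBuilder from utils/type_dispatch.hpp: for every supported type id it instantiates a factory that yields either a kernel function pointer or nullptr. A self-contained sketch of that idea follows, with a hypothetical three-type universe and simplified kernel signatures standing in for dpctl's real type map; it is an illustration, not the library's implementation.

    // Sketch of the factory/builder dispatch idea (assumed, simplified).
    #include <complex>
    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <type_traits>

    using unary_fn_ptr_t = void (*)(const void *, void *, std::size_t);

    // a stand-in elementwise kernel, templated on the element type
    template <typename T>
    void abs_impl(const void *src, void *dst, std::size_t n)
    {
        const T *s = static_cast<const T *>(src);
        T *d = static_cast<T *>(dst);
        for (std::size_t i = 0; i < n; ++i)
            d[i] = (s[i] < T(0)) ? -s[i] : s[i];
    }

    // factory: a kernel for supported types, nullptr otherwise
    template <typename T> struct AbsFactoryLike
    {
        unary_fn_ptr_t get()
        {
            if constexpr (std::is_arithmetic_v<T>)
                return abs_impl<T>;
            else
                return nullptr; // plays the role of an unsupported entry
        }
    };

    // builder: fill one table slot per type in the type universe
    template <template <typename> class Factory, typename... Ts>
    void populate(unary_fn_ptr_t (&vec)[sizeof...(Ts)])
    {
        std::size_t i = 0;
        ((vec[i++] = Factory<Ts>{}.get()), ...);
    }

    int main()
    {
        unary_fn_ptr_t table[3];
        populate<AbsFactoryLike, std::int32_t, float, std::complex<float>>(
            table);
        float in[2] = {-1.5f, 2.0f}, out[2];
        table[1](in, out, 2); // index 1 is the float "type id" here
        std::cout << out[0] << ", " << out[1] << "\n"; // prints 1.5, 2
    }
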
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/abs.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/abs.hpp
new file mode 100644
index 0000000000..d09eafc6bd
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/abs.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_abs(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/acos.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/acos.cpp
new file mode 100644
index 0000000000..011cc052fb
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/acos.cpp
@@ -0,0 +1,119 @@
+//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
+#include <vector>
+
+#include "acos.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/acos.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U02: ==== ACOS (x)
+namespace impl
+{
+
+namespace acos_fn_ns = dpctl::tensor::kernels::acos;
+
+static unary_contig_impl_fn_ptr_t acos_contig_dispatch_vector[td_ns::num_types];
+static int acos_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    acos_strided_dispatch_vector[td_ns::num_types];
+
+void populate_acos_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = acos_fn_ns;
+
+    using fn_ns::AcosContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AcosContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(acos_contig_dispatch_vector);
+
+    using fn_ns::AcosStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AcosStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(acos_strided_dispatch_vector);
+
+    using fn_ns::AcosTypeMapFactory;
+    DispatchVectorBuilder<int, AcosTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(acos_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_acos(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_acos_dispatch_vectors();
+        using impl::acos_contig_dispatch_vector;
+        using impl::acos_output_typeid_vector;
+        using impl::acos_strided_dispatch_vector;
+
+        auto acos_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, acos_output_typeid_vector,
+                acos_contig_dispatch_vector, acos_strided_dispatch_vector);
+        };
+        m.def("_acos", acos_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto acos_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, acos_output_typeid_vector);
+        };
+        m.def("_acos_result_type", acos_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/acos.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/acos.hpp
new file mode 100644
index 0000000000..3a43d4087c
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/acos.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_acos(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/acosh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/acosh.cpp
new file mode 100644
index 0000000000..526bd44f12
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/acosh.cpp
@@ -0,0 +1,121 @@
+//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
+#include <vector>
+
+#include "acosh.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/acosh.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace py = pybind11;

+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U03: ==== ACOSH (x)
+namespace impl
+{
+
+namespace acosh_fn_ns = dpctl::tensor::kernels::acosh;
+
+static unary_contig_impl_fn_ptr_t
+    acosh_contig_dispatch_vector[td_ns::num_types];
+static int acosh_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    acosh_strided_dispatch_vector[td_ns::num_types];
+
+void populate_acosh_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = acosh_fn_ns;
+
+    using fn_ns::AcoshContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AcoshContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(acosh_contig_dispatch_vector);
+
+    using fn_ns::AcoshStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AcoshStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(acosh_strided_dispatch_vector);
+
+    using fn_ns::AcoshTypeMapFactory;
+    DispatchVectorBuilder<int, AcoshTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(acosh_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_acosh(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_acosh_dispatch_vectors();
+        using impl::acosh_contig_dispatch_vector;
+        using impl::acosh_output_typeid_vector;
+        using impl::acosh_strided_dispatch_vector;
+
+        auto acosh_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, acosh_output_typeid_vector,
+                acosh_contig_dispatch_vector, acosh_strided_dispatch_vector);
+        };
+        m.def("_acosh", acosh_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto acosh_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              acosh_output_typeid_vector);
+        };
+        m.def("_acosh_result_type", acosh_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/acosh.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/acosh.hpp
new file mode 100644
index 0000000000..dd13ba886c
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/acosh.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_acosh(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/add.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/add.cpp
new file mode 100644
index 0000000000..247b8e0283
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/add.cpp
@@ -0,0 +1,229 @@
+//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
+#include <vector>
+
+#include "add.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/add.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B01: ===== ADD (x1, x2)
+namespace impl
+{
+
+namespace add_fn_ns = dpctl::tensor::kernels::add;
+
+static binary_contig_impl_fn_ptr_t add_contig_dispatch_table[td_ns::num_types]
+                                                            [td_ns::num_types];
+static int add_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    add_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+// add(matrix, row)
+static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
+    add_contig_matrix_contig_row_broadcast_dispatch_table[td_ns::num_types]
+                                                         [td_ns::num_types];
+
+// add(row, matrix)
+static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
+    add_contig_row_contig_matrix_broadcast_dispatch_table[td_ns::num_types]
+                                                         [td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    add_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    add_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
+    add_inplace_row_matrix_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_add_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = add_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::AddTypeMapFactory;
+    DispatchTableBuilder<int, AddTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(add_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::AddStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, AddStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(add_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::AddContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, AddContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(add_contig_dispatch_table);
+
+    // function pointers for operation on contiguous matrix, contiguous row
+    // with contiguous matrix output
+    using fn_ns::AddContigMatrixContigRowBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
+        AddContigMatrixContigRowBroadcastFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        add_contig_matrix_contig_row_broadcast_dispatch_table);
+
+    // function pointers for operation on contiguous row, contiguous matrix
+    // with contiguous matrix output
+    using fn_ns::AddContigRowContigMatrixBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
+        AddContigRowContigMatrixBroadcastFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(
+        add_contig_row_contig_matrix_broadcast_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::AddInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         AddInplaceStridedFactory, num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(add_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::AddInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         AddInplaceContigFactory, num_types>
+        dtb7;
+    dtb7.populate_dispatch_table(add_inplace_contig_dispatch_table);
+
+    // function pointers for inplace operation on contiguous matrix
+    // and contiguous row
+    using fn_ns::AddInplaceRowMatrixBroadcastFactory;
+    DispatchTableBuilder<binary_inplace_row_matrix_broadcast_impl_fn_ptr_t,
+                         AddInplaceRowMatrixBroadcastFactory, num_types>
+        dtb8;
+    dtb8.populate_dispatch_table(add_inplace_row_matrix_dispatch_table);
+};
+
+} // namespace impl
+
+void init_add(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_add_dispatch_tables();
+        using impl::add_contig_dispatch_table;
+        using impl::add_contig_matrix_contig_row_broadcast_dispatch_table;
+        using impl::add_contig_row_contig_matrix_broadcast_dispatch_table;
+        using impl::add_output_id_table;
+        using impl::add_strided_dispatch_table;
+
+        auto add_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                             const arrayT &dst, sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, add_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                add_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                add_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                add_contig_matrix_contig_row_broadcast_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                add_contig_row_contig_matrix_broadcast_dispatch_table);
+        };
+        auto add_result_type_pyapi = [&](const py::dtype &dtype1,
+                                         const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               add_output_id_table);
+        };
+        m.def("_add", add_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_add_result_type", add_result_type_pyapi, "");
+
+        using impl::add_inplace_contig_dispatch_table;
+        using impl::add_inplace_row_matrix_dispatch_table;
+        using impl::add_inplace_strided_dispatch_table;
+
+        auto add_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                     sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, add_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                add_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                add_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                add_inplace_row_matrix_dispatch_table);
+        };
+        m.def("_add_inplace", add_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
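add.cpp registers eight tables in total: a result-type table, contiguous and strided kernel tables, two matrix/row broadcast specializations, and three in-place variants. The add_output_id_table is the piece consulted by _add_result_type: a two-dimensional array indexed by the two input type ids whose entry is the output type id, with a sentinel for unsupported combinations. A toy illustration of that lookup follows, using an assumed three-type universe instead of dpctl's real set of type ids; it shows the mechanism, not the library's actual tables.

    // Sketch of an output_id_table lookup (assumed three-type universe).
    #include <iostream>
    #include <stdexcept>

    constexpr int num_types = 3; // hypothetical ids: 0=int32, 1=float, 2=double

    // result-type table in the spirit of add_output_id_table
    constexpr int output_id_table[num_types][num_types] = {
        {0, 1, 2}, // int32  + {int32, float, double}
        {1, 1, 2}, // float  + {int32, float, double}
        {2, 2, 2}, // double + {int32, float, double}
    };

    int result_type_id(int t1, int t2)
    {
        if (t1 < 0 || t1 >= num_types || t2 < 0 || t2 >= num_types)
            throw std::out_of_range("unknown type id");
        return output_id_table[t1][t2];
    }

    int main()
    {
        // int32 + float -> float, i.e. type id 1 in this toy universe
        std::cout << result_type_id(0, 1) << "\n";
    }
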
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/add.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/add.hpp
new file mode 100644
index 0000000000..5f88bfaa04
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/add.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_add(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/asin.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/asin.cpp
new file mode 100644
index 0000000000..14ef5e2665
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/asin.cpp
@@ -0,0 +1,119 @@
+//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
+#include <vector>
+
+#include "asin.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/asin.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U04: ==== ASIN (x)
+namespace impl
+{
+
+namespace asin_fn_ns = dpctl::tensor::kernels::asin;
+
+static unary_contig_impl_fn_ptr_t asin_contig_dispatch_vector[td_ns::num_types];
+static int asin_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    asin_strided_dispatch_vector[td_ns::num_types];
+
+void populate_asin_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = asin_fn_ns;
+
+    using fn_ns::AsinContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AsinContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(asin_contig_dispatch_vector);
+
+    using fn_ns::AsinStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AsinStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(asin_strided_dispatch_vector);
+
+    using fn_ns::AsinTypeMapFactory;
+    DispatchVectorBuilder<int, AsinTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(asin_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_asin(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_asin_dispatch_vectors();
+        using impl::asin_contig_dispatch_vector;
+        using impl::asin_output_typeid_vector;
+        using impl::asin_strided_dispatch_vector;
+
+        auto asin_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, asin_output_typeid_vector,
+                asin_contig_dispatch_vector, asin_strided_dispatch_vector);
+        };
+        m.def("_asin", asin_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto asin_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, asin_output_typeid_vector);
+        };
+        m.def("_asin_result_type", asin_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
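Each *_pyapi lambda forwards a `depends` vector of sycl::event objects, which is how py_unary_ufunc threads host-visible ordering into kernel submission. A compact sketch of that role follows; the function submit_unary and its kernel body are stand-ins written for illustration under SYCL 2020, not the library's implementation, and the pointers are assumed to be USM allocations.

    // Sketch: honoring a caller-supplied dependency list on submission.
    #include <sycl/sycl.hpp>
    #include <vector>

    sycl::event submit_unary(sycl::queue &q, const float *src, float *dst,
                             std::size_t n,
                             const std::vector<sycl::event> &depends = {})
    {
        return q.submit([&](sycl::handler &cgh) {
            cgh.depends_on(depends); // kernel waits on the given events
            cgh.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> i) {
                dst[i] = sycl::asin(src[i]); // stand-in elementwise body
            });
        });
    }
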
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/asin.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/asin.hpp
new file mode 100644
index 0000000000..0beed1d19c
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/asin.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_asin(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/asinh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/asinh.cpp
new file mode 100644
index 0000000000..dd0b4e62f7
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/asinh.cpp
@@ -0,0 +1,121 @@
+//===----------- Implementation of _tensor_impl module  ---------*-C++-*-/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
+#include <vector>
+
+#include "asinh.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/asinh.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U05: ==== ASINH (x)
+namespace impl
+{
+
+namespace asinh_fn_ns = dpctl::tensor::kernels::asinh;
+
+static unary_contig_impl_fn_ptr_t
+    asinh_contig_dispatch_vector[td_ns::num_types];
+static int asinh_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    asinh_strided_dispatch_vector[td_ns::num_types];
+
+void populate_asinh_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = asinh_fn_ns;
+
+    using fn_ns::AsinhContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AsinhContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(asinh_contig_dispatch_vector);
+
+    using fn_ns::AsinhStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AsinhStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(asinh_strided_dispatch_vector);
+
+    using fn_ns::AsinhTypeMapFactory;
+    DispatchVectorBuilder<int, AsinhTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(asinh_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_asinh(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_asinh_dispatch_vectors();
+        using impl::asinh_contig_dispatch_vector;
+        using impl::asinh_output_typeid_vector;
+        using impl::asinh_strided_dispatch_vector;
+
+        auto asinh_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, asinh_output_typeid_vector,
+                asinh_contig_dispatch_vector, asinh_strided_dispatch_vector);
+        };
+        m.def("_asinh", asinh_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto asinh_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              asinh_output_typeid_vector);
+        };
+        m.def("_asinh_result_type", asinh_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_asinh(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/atan.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/atan.cpp
new file mode 100644
index 0000000000..81ff00c46a
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/atan.cpp
@@ -0,0 +1,119 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "atan.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/atan.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U06: ==== ATAN (x)
+namespace impl
+{
+
+namespace atan_fn_ns = dpctl::tensor::kernels::atan;
+
+static unary_contig_impl_fn_ptr_t atan_contig_dispatch_vector[td_ns::num_types];
+static int atan_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    atan_strided_dispatch_vector[td_ns::num_types];
+
+void populate_atan_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = atan_fn_ns;
+
+    using fn_ns::AtanContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AtanContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(atan_contig_dispatch_vector);
+
+    using fn_ns::AtanStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AtanStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(atan_strided_dispatch_vector);
+
+    using fn_ns::AtanTypeMapFactory;
+    DispatchVectorBuilder<int, AtanTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(atan_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_atan(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_atan_dispatch_vectors();
+        using impl::atan_contig_dispatch_vector;
+        using impl::atan_output_typeid_vector;
+        using impl::atan_strided_dispatch_vector;
+
+        auto atan_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, atan_output_typeid_vector,
+                atan_contig_dispatch_vector, atan_strided_dispatch_vector);
+        };
+        m.def("_atan", atan_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto atan_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, atan_output_typeid_vector);
+        };
+        m.def("_atan_result_type", atan_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/atan.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/atan.hpp
new file mode 100644
index 0000000000..86df06699c
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/atan.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_atan(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/atan2.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/atan2.cpp
new file mode 100644
index 0000000000..d12a4ff540
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/atan2.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "atan2.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/atan2.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B02: ===== ATAN2 (x1, x2)
+namespace impl
+{
+namespace atan2_fn_ns = dpctl::tensor::kernels::atan2;
+
+static binary_contig_impl_fn_ptr_t
+    atan2_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int atan2_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    atan2_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_atan2_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = atan2_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::Atan2TypeMapFactory;
+    DispatchTableBuilder<int, Atan2TypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(atan2_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::Atan2StridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, Atan2StridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(atan2_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::Atan2ContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, Atan2ContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(atan2_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_atan2(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_atan2_dispatch_tables();
+        using impl::atan2_contig_dispatch_table;
+        using impl::atan2_output_id_table;
+        using impl::atan2_strided_dispatch_table;
+
+        auto atan2_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                               const arrayT &dst, sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, atan2_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                atan2_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                atan2_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto atan2_result_type_pyapi = [&](const py::dtype &dtype1,
+                                           const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               atan2_output_id_table);
+        };
+        m.def("_atan2", atan2_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_atan2_result_type", atan2_result_type_pyapi, "");
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/atan2.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/atan2.hpp
new file mode 100644
index 0000000000..f369d12208
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/atan2.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_atan2(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/atanh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/atanh.cpp
new file mode 100644
index 0000000000..c42769b8d0
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/atanh.cpp
@@ -0,0 +1,121 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "atanh.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/atanh.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U07: ==== ATANH (x)
+namespace impl
+{
+
+namespace atanh_fn_ns = dpctl::tensor::kernels::atanh;
+
+static unary_contig_impl_fn_ptr_t
+    atanh_contig_dispatch_vector[td_ns::num_types];
+static int atanh_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    atanh_strided_dispatch_vector[td_ns::num_types];
+
+void populate_atanh_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = atanh_fn_ns;
+
+    using fn_ns::AtanhContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, AtanhContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(atanh_contig_dispatch_vector);
+
+    using fn_ns::AtanhStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, AtanhStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(atanh_strided_dispatch_vector);
+
+    using fn_ns::AtanhTypeMapFactory;
+    DispatchVectorBuilder<int, AtanhTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(atanh_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_atanh(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_atanh_dispatch_vectors();
+        using impl::atanh_contig_dispatch_vector;
+        using impl::atanh_output_typeid_vector;
+        using impl::atanh_strided_dispatch_vector;
+
+        auto atanh_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, atanh_output_typeid_vector,
+                atanh_contig_dispatch_vector, atanh_strided_dispatch_vector);
+        };
+        m.def("_atanh", atanh_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto atanh_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              atanh_output_typeid_vector);
+        };
+        m.def("_atanh_result_type", atanh_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/atanh.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/atanh.hpp
new file mode 100644
index 0000000000..ba2930d80e
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/atanh.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_atanh(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp
new file mode 100644
index 0000000000..f86f5112cd
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp
@@ -0,0 +1,190 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "bitwise_and.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/bitwise_and.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B03: ===== BITWISE_AND (x1, x2)
+namespace impl
+{
+namespace bitwise_and_fn_ns = dpctl::tensor::kernels::bitwise_and;
+
+static binary_contig_impl_fn_ptr_t
+    bitwise_and_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int bitwise_and_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    bitwise_and_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    bitwise_and_inplace_contig_dispatch_table[td_ns::num_types]
+                                             [td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    bitwise_and_inplace_strided_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+void populate_bitwise_and_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = bitwise_and_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::BitwiseAndTypeMapFactory;
+    DispatchTableBuilder<int, BitwiseAndTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(bitwise_and_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::BitwiseAndStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, BitwiseAndStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(bitwise_and_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::BitwiseAndContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, BitwiseAndContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(bitwise_and_contig_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::BitwiseAndInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         BitwiseAndInplaceStridedFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(bitwise_and_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::BitwiseAndInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         BitwiseAndInplaceContigFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(bitwise_and_inplace_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_bitwise_and(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_bitwise_and_dispatch_tables();
+        using impl::bitwise_and_contig_dispatch_table;
+        using impl::bitwise_and_output_id_table;
+        using impl::bitwise_and_strided_dispatch_table;
+
+        auto bitwise_and_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                     const arrayT &dst, sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, bitwise_and_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                bitwise_and_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                bitwise_and_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto bitwise_and_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                 const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               bitwise_and_output_id_table);
+        };
+        m.def("_bitwise_and", bitwise_and_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_bitwise_and_result_type", bitwise_and_result_type_pyapi, "");
+
+        using impl::bitwise_and_inplace_contig_dispatch_table;
+        using impl::bitwise_and_inplace_strided_dispatch_table;
+
+        auto bitwise_and_inplace_pyapi =
+            [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q,
+                const event_vecT &depends = {}) {
+                return py_binary_inplace_ufunc(
+                    src, dst, exec_q, depends, bitwise_and_output_id_table,
+                    // function pointers to handle inplace operation on
+                    // contiguous arrays (pointers may be nullptr)
+                    bitwise_and_inplace_contig_dispatch_table,
+                    // function pointers to handle inplace operation on strided
+                    // arrays (most general case)
+                    bitwise_and_inplace_strided_dispatch_table,
+                    // function pointers to handle inplace operation on
+                    // c-contig matrix with c-contig row with broadcasting
+                    // (may be nullptr)
+                    td_ns::NullPtrTable<
+                        binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+            };
+        m.def("_bitwise_and_inplace", bitwise_and_inplace_pyapi, "",
+              py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp
new file mode 100644
index 0000000000..682b337efd
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_bitwise_and(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp
new file mode 100644
index 0000000000..29a04cff38
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp
@@ -0,0 +1,123 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "bitwise_invert.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/bitwise_invert.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U08: ===== BITWISE_INVERT (x)
+namespace impl
+{
+
+namespace bitwise_invert_fn_ns = dpctl::tensor::kernels::bitwise_invert;
+
+static unary_contig_impl_fn_ptr_t
+    bitwise_invert_contig_dispatch_vector[td_ns::num_types];
+static int bitwise_invert_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    bitwise_invert_strided_dispatch_vector[td_ns::num_types];
+
+void populate_bitwise_invert_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = bitwise_invert_fn_ns;
+
+    using fn_ns::BitwiseInvertContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t,
+                          BitwiseInvertContigFactory, num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(bitwise_invert_contig_dispatch_vector);
+
+    using fn_ns::BitwiseInvertStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t,
+                          BitwiseInvertStridedFactory, num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(bitwise_invert_strided_dispatch_vector);
+
+    using fn_ns::BitwiseInvertTypeMapFactory;
+    DispatchVectorBuilder<int, BitwiseInvertTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(bitwise_invert_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_bitwise_invert(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_bitwise_invert_dispatch_vectors();
+        using impl::bitwise_invert_contig_dispatch_vector;
+        using impl::bitwise_invert_output_typeid_vector;
+        using impl::bitwise_invert_strided_dispatch_vector;
+
+        auto bitwise_invert_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                        sycl::queue &exec_q,
+                                        const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  bitwise_invert_output_typeid_vector,
+                                  bitwise_invert_contig_dispatch_vector,
+                                  bitwise_invert_strided_dispatch_vector);
+        };
+        m.def("_bitwise_invert", bitwise_invert_pyapi, "", py::arg("src"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+
+        auto bitwise_invert_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(
+                dtype, bitwise_invert_output_typeid_vector);
+        };
+        m.def("_bitwise_invert_result_type", bitwise_invert_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp
new file mode 100644
index 0000000000..5b5d8398dc
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_bitwise_invert(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp
new file mode 100644
index 0000000000..7969bc4ffa
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp
@@ -0,0 +1,200 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "bitwise_left_shift.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/bitwise_left_shift.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B04: ===== BITWISE_LEFT_SHIFT (x1, x2)
+namespace impl
+{
+namespace bitwise_left_shift_fn_ns = dpctl::tensor::kernels::bitwise_left_shift;
+
+static binary_contig_impl_fn_ptr_t
+    bitwise_left_shift_contig_dispatch_table[td_ns::num_types]
+                                            [td_ns::num_types];
+static int bitwise_left_shift_output_id_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    bitwise_left_shift_strided_dispatch_table[td_ns::num_types]
+                                             [td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    bitwise_left_shift_inplace_contig_dispatch_table[td_ns::num_types]
+                                                    [td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    bitwise_left_shift_inplace_strided_dispatch_table[td_ns::num_types]
+                                                     [td_ns::num_types];
+
+void populate_bitwise_left_shift_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = bitwise_left_shift_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::BitwiseLeftShiftTypeMapFactory;
+    DispatchTableBuilder<int, BitwiseLeftShiftTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(bitwise_left_shift_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::BitwiseLeftShiftStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
+                         BitwiseLeftShiftStridedFactory, num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(bitwise_left_shift_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::BitwiseLeftShiftContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t,
+                         BitwiseLeftShiftContigFactory, num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(bitwise_left_shift_contig_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::BitwiseLeftShiftInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         BitwiseLeftShiftInplaceStridedFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        bitwise_left_shift_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::BitwiseLeftShiftInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         BitwiseLeftShiftInplaceContigFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(
+        bitwise_left_shift_inplace_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_bitwise_left_shift(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_bitwise_left_shift_dispatch_tables();
+        using impl::bitwise_left_shift_contig_dispatch_table;
+        using impl::bitwise_left_shift_output_id_table;
+        using impl::bitwise_left_shift_strided_dispatch_table;
+
+        auto bitwise_left_shift_pyapi = [&](const arrayT &src1,
+                                            const arrayT &src2,
+                                            const arrayT &dst,
+                                            sycl::queue &exec_q,
+                                            const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends,
+                bitwise_left_shift_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                bitwise_left_shift_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                bitwise_left_shift_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto bitwise_left_shift_result_type_pyapi =
+            [&](const py::dtype &dtype1, const py::dtype &dtype2) {
+                return py_binary_ufunc_result_type(
+                    dtype1, dtype2, bitwise_left_shift_output_id_table);
+            };
+        m.def("_bitwise_left_shift", bitwise_left_shift_pyapi, "",
+              py::arg("src1"), py::arg("src2"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+        m.def("_bitwise_left_shift_result_type",
+              bitwise_left_shift_result_type_pyapi, "");
+
+        using impl::bitwise_left_shift_inplace_contig_dispatch_table;
+        using impl::bitwise_left_shift_inplace_strided_dispatch_table;
+
+        auto bitwise_left_shift_inplace_pyapi =
+            [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q,
+                const event_vecT &depends = {}) {
+                return py_binary_inplace_ufunc(
+                    src, dst, exec_q, depends,
+                    bitwise_left_shift_output_id_table,
+                    // function pointers to handle inplace operation on
+                    // contiguous arrays (pointers may be nullptr)
+                    bitwise_left_shift_inplace_contig_dispatch_table,
+                    // function pointers to handle inplace operation on strided
+                    // arrays (most general case)
+                    bitwise_left_shift_inplace_strided_dispatch_table,
+                    // function pointers to handle inplace operation on
+                    // c-contig matrix with c-contig row with broadcasting
+                    // (may be nullptr)
+                    td_ns::NullPtrTable<
+                        binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+            };
+        m.def("_bitwise_left_shift_inplace", bitwise_left_shift_inplace_pyapi,
+              "", py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp
new file mode 100644
index 0000000000..9edcba43ab
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_bitwise_left_shift(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp
new file mode 100644
index 0000000000..33a57f907c
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp
@@ -0,0 +1,190 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "bitwise_or.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/bitwise_or.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B05: ===== BITWISE_OR (x1, x2)
+namespace impl
+{
+namespace bitwise_or_fn_ns = dpctl::tensor::kernels::bitwise_or;
+
+static binary_contig_impl_fn_ptr_t
+    bitwise_or_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int bitwise_or_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    bitwise_or_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    bitwise_or_inplace_contig_dispatch_table[td_ns::num_types]
+                                            [td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    bitwise_or_inplace_strided_dispatch_table[td_ns::num_types]
+                                             [td_ns::num_types];
+
+void populate_bitwise_or_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = bitwise_or_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::BitwiseOrTypeMapFactory;
+    DispatchTableBuilder<int, BitwiseOrTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(bitwise_or_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::BitwiseOrStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, BitwiseOrStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(bitwise_or_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::BitwiseOrContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, BitwiseOrContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(bitwise_or_contig_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::BitwiseOrInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         BitwiseOrInplaceStridedFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(bitwise_or_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::BitwiseOrInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         BitwiseOrInplaceContigFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(bitwise_or_inplace_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_bitwise_or(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_bitwise_or_dispatch_tables();
+        using impl::bitwise_or_contig_dispatch_table;
+        using impl::bitwise_or_output_id_table;
+        using impl::bitwise_or_strided_dispatch_table;
+
+        auto bitwise_or_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                    const arrayT &dst, sycl::queue &exec_q,
+                                    const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, bitwise_or_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                bitwise_or_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                bitwise_or_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto bitwise_or_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               bitwise_or_output_id_table);
+        };
+        m.def("_bitwise_or", bitwise_or_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_bitwise_or_result_type", bitwise_or_result_type_pyapi, "");
+
+        using impl::bitwise_or_inplace_contig_dispatch_table;
+        using impl::bitwise_or_inplace_strided_dispatch_table;
+
+        auto bitwise_or_inplace_pyapi =
+            [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q,
+                const event_vecT &depends = {}) {
+                return py_binary_inplace_ufunc(
+                    src, dst, exec_q, depends, bitwise_or_output_id_table,
+                    // function pointers to handle inplace operation on
+                    // contiguous arrays (pointers may be nullptr)
+                    bitwise_or_inplace_contig_dispatch_table,
+                    // function pointers to handle inplace operation on strided
+                    // arrays (most general case)
+                    bitwise_or_inplace_strided_dispatch_table,
+                    // function pointers to handle inplace operation on
+                    // c-contig matrix with c-contig row with broadcasting
+                    // (may be nullptr)
+                    td_ns::NullPtrTable<
+                        binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+            };
+        m.def("_bitwise_or_inplace", bitwise_or_inplace_pyapi, "",
+              py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp
new file mode 100644
index 0000000000..7603ed8277
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_bitwise_or(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp
new file mode 100644
index 0000000000..3847204b1f
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp
@@ -0,0 +1,201 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "bitwise_right_shift.hpp"
+#include "elementwise_functions.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/bitwise_right_shift.hpp"
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B06: ===== BITWISE_RIGHT_SHIFT (x1, x2)
+namespace impl
+{
+namespace bitwise_right_shift_fn_ns =
+    dpctl::tensor::kernels::bitwise_right_shift;
+
+static binary_contig_impl_fn_ptr_t
+    bitwise_right_shift_contig_dispatch_table[td_ns::num_types]
+                                             [td_ns::num_types];
+static int bitwise_right_shift_output_id_table[td_ns::num_types]
+                                               [td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    bitwise_right_shift_strided_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    bitwise_right_shift_inplace_contig_dispatch_table[td_ns::num_types]
+                                                     [td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    bitwise_right_shift_inplace_strided_dispatch_table[td_ns::num_types]
+                                                      [td_ns::num_types];
+
+void populate_bitwise_right_shift_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = bitwise_right_shift_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::BitwiseRightShiftTypeMapFactory;
+    DispatchTableBuilder<int, BitwiseRightShiftTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(bitwise_right_shift_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::BitwiseRightShiftStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
+                         BitwiseRightShiftStridedFactory, num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(bitwise_right_shift_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::BitwiseRightShiftContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t,
+                         BitwiseRightShiftContigFactory, num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(bitwise_right_shift_contig_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::BitwiseRightShiftInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         BitwiseRightShiftInplaceStridedFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        bitwise_right_shift_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::BitwiseRightShiftInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         BitwiseRightShiftInplaceContigFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(
+        bitwise_right_shift_inplace_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_bitwise_right_shift(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_bitwise_right_shift_dispatch_tables();
+        using impl::bitwise_right_shift_contig_dispatch_table;
+        using impl::bitwise_right_shift_output_id_table;
+        using impl::bitwise_right_shift_strided_dispatch_table;
+
+        auto bitwise_right_shift_pyapi = [&](const arrayT &src1,
+                                             const arrayT &src2,
+                                             const arrayT &dst,
+                                             sycl::queue &exec_q,
+                                             const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends,
+                bitwise_right_shift_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                bitwise_right_shift_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                bitwise_right_shift_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto bitwise_right_shift_result_type_pyapi =
+            [&](const py::dtype &dtype1, const py::dtype &dtype2) {
+                return py_binary_ufunc_result_type(
+                    dtype1, dtype2, bitwise_right_shift_output_id_table);
+            };
+        m.def("_bitwise_right_shift", bitwise_right_shift_pyapi, "",
+              py::arg("src1"), py::arg("src2"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+        m.def("_bitwise_right_shift_result_type",
+              bitwise_right_shift_result_type_pyapi, "");
+
+        using impl::bitwise_right_shift_inplace_contig_dispatch_table;
+        using impl::bitwise_right_shift_inplace_strided_dispatch_table;
+
+        auto bitwise_right_shift_inplace_pyapi =
+            [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q,
+                const event_vecT &depends = {}) {
+                return py_binary_inplace_ufunc(
+                    src, dst, exec_q, depends,
+                    bitwise_right_shift_output_id_table,
+                    // function pointers to handle inplace operation on
+                    // contiguous arrays (pointers may be nullptr)
+                    bitwise_right_shift_inplace_contig_dispatch_table,
+                    // function pointers to handle inplace operation on strided
+                    // arrays (most general case)
+                    bitwise_right_shift_inplace_strided_dispatch_table,
+                    // function pointers to handle inplace operation on
+                    // c-contig matrix with c-contig row with broadcasting
+                    // (may be nullptr)
+                    td_ns::NullPtrTable<
+                        binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+            };
+        m.def("_bitwise_right_shift_inplace", bitwise_right_shift_inplace_pyapi,
+              "", py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp
new file mode 100644
index 0000000000..5ce2bca4e7
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_bitwise_right_shift(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp
new file mode 100644
index 0000000000..71d606766f
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp
@@ -0,0 +1,190 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "bitwise_xor.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/bitwise_xor.hpp" +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B07: ===== BITWISE_XOR (x1, x2) +namespace impl +{ +namespace bitwise_xor_fn_ns = dpctl::tensor::kernels::bitwise_xor; + +static binary_contig_impl_fn_ptr_t + bitwise_xor_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int bitwise_xor_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + bitwise_xor_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + bitwise_xor_inplace_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + bitwise_xor_inplace_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_bitwise_xor_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = bitwise_xor_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::BitwiseXorTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(bitwise_xor_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::BitwiseXorStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(bitwise_xor_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::BitwiseXorContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(bitwise_xor_contig_dispatch_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::BitwiseXorInplaceStridedFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(bitwise_xor_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::BitwiseXorInplaceContigFactory; + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(bitwise_xor_inplace_contig_dispatch_table); +}; + +} // namespace impl + +void init_bitwise_xor(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_bitwise_xor_dispatch_tables(); + using impl::bitwise_xor_contig_dispatch_table; + using impl::bitwise_xor_output_id_table; + using impl::bitwise_xor_strided_dispatch_table; + + auto 
bitwise_xor_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, bitwise_xor_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + bitwise_xor_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + bitwise_xor_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto bitwise_xor_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + bitwise_xor_output_id_table); + }; + m.def("_bitwise_xor", bitwise_xor_pyapi, "", py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_bitwise_xor_result_type", bitwise_xor_result_type_pyapi, ""); + + using impl::bitwise_xor_inplace_contig_dispatch_table; + using impl::bitwise_xor_inplace_strided_dispatch_table; + + auto bitwise_xor_inplace_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, bitwise_xor_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + bitwise_xor_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + bitwise_xor_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + td_ns::NullPtrTable< + binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{}); + }; + m.def("_bitwise_xor_inplace", bitwise_xor_inplace_pyapi, "", + py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp new file mode 100644 index 0000000000..7b092aadda --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_bitwise_xor(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.cpp new file mode 100644 index 0000000000..b42f234c0d --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
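[Editorial illustration, not part of the patch.] The binary registrations above all lean on the same mechanism: a two-dimensional table of kernel pointers indexed by the type ids of both operands, populated once by templated factories. A minimal standalone sketch of that pattern (all names here are hypothetical, not dpctl internals):

    #include <cstdio>

    constexpr int num_types = 3; // toy universe: 0=int, 1=float, 2=double

    using binary_fn_t = double (*)(double, double);

    static double same_type_impl(double a, double b) { return a + b; }

    // table[i][j] holds the kernel for operand type ids (i, j); nullptr
    // marks an unsupported combination, as in the tables populated above.
    static binary_fn_t dispatch_table[num_types][num_types];

    static void populate_dispatch_table()
    {
        for (int i = 0; i < num_types; ++i)
            for (int j = 0; j < num_types; ++j)
                dispatch_table[i][j] = (i == j) ? &same_type_impl : nullptr;
    }

    int main()
    {
        populate_dispatch_table();
        if (binary_fn_t fn = dispatch_table[1][1])
            std::printf("%g\n", fn(2.0, 3.0)); // prints 5
        return 0;
    }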
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "cbrt.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/cbrt.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U37: ==== CBRT (x) +namespace impl +{ + +namespace cbrt_fn_ns = dpctl::tensor::kernels::cbrt; + +static unary_contig_impl_fn_ptr_t cbrt_contig_dispatch_vector[td_ns::num_types]; +static int cbrt_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + cbrt_strided_dispatch_vector[td_ns::num_types]; + +void populate_cbrt_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = cbrt_fn_ns; + + using fn_ns::CbrtContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(cbrt_contig_dispatch_vector); + + using fn_ns::CbrtStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(cbrt_strided_dispatch_vector); + + using fn_ns::CbrtTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(cbrt_output_typeid_vector); +}; + +} // namespace impl + +void init_cbrt(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_cbrt_dispatch_vectors(); + using impl::cbrt_contig_dispatch_vector; + using impl::cbrt_output_typeid_vector; + using impl::cbrt_strided_dispatch_vector; + + auto cbrt_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, cbrt_output_typeid_vector, + cbrt_contig_dispatch_vector, cbrt_strided_dispatch_vector); + }; + m.def("_cbrt", cbrt_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto cbrt_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, cbrt_output_typeid_vector); + }; + m.def("_cbrt_result_type", cbrt_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.hpp new file mode 100644 index 0000000000..74da1de81a --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_cbrt(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/ceil.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/ceil.cpp new file mode 100644 index 0000000000..f1bb362c5b --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/ceil.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
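[Editorial illustration, not part of the patch.] Unary functions such as cbrt need only a one-dimensional dispatch vector, one kernel pointer per input type id. A sketch of the idea, with invented names and one shared toy kernel standing in for the per-type factory instantiations:

    #include <cmath>
    #include <cstdio>

    constexpr int num_types = 3; // toy type ids

    using unary_fn_t = double (*)(double);

    static unary_fn_t cbrt_dispatch_vector[num_types];

    static void populate_cbrt_dispatch_vector()
    {
        // A real DispatchVectorBuilder instantiates a templated factory per
        // type; here every toy type shares one double-precision kernel.
        for (int i = 0; i < num_types; ++i)
            cbrt_dispatch_vector[i] = +[](double x) { return std::cbrt(x); };
    }

    int main()
    {
        populate_cbrt_dispatch_vector();
        std::printf("%g\n", cbrt_dispatch_vector[0](27.0)); // prints 3
        return 0;
    }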
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "ceil.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/ceil.hpp" +#include "kernels/elementwise_functions/common.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U09: ==== CEIL (x) +namespace impl +{ + +namespace ceil_fn_ns = dpctl::tensor::kernels::ceil; + +static unary_contig_impl_fn_ptr_t ceil_contig_dispatch_vector[td_ns::num_types]; +static int ceil_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + ceil_strided_dispatch_vector[td_ns::num_types]; + +void populate_ceil_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = ceil_fn_ns; + + using fn_ns::CeilContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(ceil_contig_dispatch_vector); + + using fn_ns::CeilStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(ceil_strided_dispatch_vector); + + using fn_ns::CeilTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(ceil_output_typeid_vector); +}; + +} // namespace impl + +void init_ceil(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_ceil_dispatch_vectors(); + using impl::ceil_contig_dispatch_vector; + using impl::ceil_output_typeid_vector; + using impl::ceil_strided_dispatch_vector; + + auto ceil_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, ceil_output_typeid_vector, + ceil_contig_dispatch_vector, ceil_strided_dispatch_vector); + }; + m.def("_ceil", ceil_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto ceil_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, ceil_output_typeid_vector); + }; + m.def("_ceil_result_type", ceil_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/ceil.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/ceil.hpp new file mode 100644 index 0000000000..4a6caf999b --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/ceil.hpp @@ -0,0 +1,44 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_ceil(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/conj.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/conj.cpp new file mode 100644 index 0000000000..cac84e63fb --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/conj.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations.
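[Editorial illustration, not part of the patch.] The *_result_type entry points resolve the output type from the input dtype alone by consulting the same type map used for dispatch. A hedged sketch of that lookup logic (toy table values, hypothetical names):

    #include <cstdio>
    #include <stdexcept>
    #include <string>

    constexpr int num_types = 3;

    // output_typeid_vector[i] is the result type id for input type id i;
    // -1 means "operation not defined for this input type".
    static const int output_typeid_vector[num_types] = {0, 1, -1};

    static int result_typeid(int arg_typeid)
    {
        if (arg_typeid < 0 || arg_typeid >= num_types)
            throw std::out_of_range("typeid " + std::to_string(arg_typeid));
        return output_typeid_vector[arg_typeid];
    }

    int main()
    {
        int out = result_typeid(1);
        if (out < 0)
            std::puts("unsupported");
        else
            std::printf("result typeid: %d\n", out); // prints 1
        return 0;
    }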
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "conj.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/conj.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U10: ==== CONJ (x) +namespace impl +{ + +namespace conj_fn_ns = dpctl::tensor::kernels::conj; + +static unary_contig_impl_fn_ptr_t conj_contig_dispatch_vector[td_ns::num_types]; +static int conj_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + conj_strided_dispatch_vector[td_ns::num_types]; + +void populate_conj_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = conj_fn_ns; + + using fn_ns::ConjContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(conj_contig_dispatch_vector); + + using fn_ns::ConjStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(conj_strided_dispatch_vector); + + using fn_ns::ConjTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(conj_output_typeid_vector); +}; + +} // namespace impl + +void init_conj(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_conj_dispatch_vectors(); + using impl::conj_contig_dispatch_vector; + using impl::conj_output_typeid_vector; + using impl::conj_strided_dispatch_vector; + + auto conj_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, conj_output_typeid_vector, + conj_contig_dispatch_vector, conj_strided_dispatch_vector); + }; + m.def("_conj", conj_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto conj_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, conj_output_typeid_vector); + }; + m.def("_conj_result_type", conj_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/conj.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/conj.hpp new file mode 100644 index 0000000000..33d9993019 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/conj.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_conj(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/copysign.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/copysign.cpp new file mode 100644 index 0000000000..6a887e0345 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/copysign.cpp @@ -0,0 +1,140 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
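[Editorial illustration, not part of the patch.] Each *_pyapi lambda above is exposed through m.def with named arguments and an empty py::list as the default for depends. Reduced to a toy binding (the module name example and the function _offset are made up):

    #include <pybind11/pybind11.h>
    #include <pybind11/stl.h>
    #include <vector>

    namespace py = pybind11;

    PYBIND11_MODULE(example, m)
    {
        auto offset_pyapi = [](int value, const std::vector<int> &depends) {
            // A real pyapi would launch a kernel that waits on `depends`;
            // here we just observe how many events were passed in.
            return value + static_cast<int>(depends.size());
        };
        m.def("_offset", offset_pyapi, "", py::arg("value"),
              py::arg("depends") = py::list());
    }

From Python, example._offset(3) would return 3, and example._offset(3, depends=[1, 2]) would return 5.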
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "copysign.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/copysign.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +// B25: ===== COPYSIGN (x1, x2) +namespace impl +{ +namespace copysign_fn_ns = dpctl::tensor::kernels::copysign; + +static binary_contig_impl_fn_ptr_t + copysign_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int copysign_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + copysign_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_copysign_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = copysign_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::CopysignTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(copysign_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::CopysignStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(copysign_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::CopysignContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(copysign_contig_dispatch_table); +}; + +} // namespace impl + +void init_copysign(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_copysign_dispatch_tables(); + using impl::copysign_contig_dispatch_table; + using impl::copysign_output_id_table; + using impl::copysign_strided_dispatch_table; + + auto copysign_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, copysign_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + copysign_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + copysign_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto copysign_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + copysign_output_id_table); + }; + m.def("_copysign", copysign_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + 
m.def("_copysign_result_type", copysign_result_type_pyapi, ""); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/copysign.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/copysign.hpp new file mode 100644 index 0000000000..d22cbdb0f0 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/copysign.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_copysign(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/cos.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/cos.cpp new file mode 100644 index 0000000000..1986610510 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/cos.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "cos.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/cos.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U11: ==== COS (x) +namespace impl +{ + +namespace cos_fn_ns = dpctl::tensor::kernels::cos; + +static unary_contig_impl_fn_ptr_t cos_contig_dispatch_vector[td_ns::num_types]; +static int cos_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + cos_strided_dispatch_vector[td_ns::num_types]; + +void populate_cos_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = cos_fn_ns; + + using fn_ns::CosContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(cos_contig_dispatch_vector); + + using fn_ns::CosStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(cos_strided_dispatch_vector); + + using fn_ns::CosTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(cos_output_typeid_vector); +}; + +} // namespace impl + +void init_cos(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_cos_dispatch_vectors(); + using impl::cos_contig_dispatch_vector; + using impl::cos_output_typeid_vector; + using impl::cos_strided_dispatch_vector; + + auto cos_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, cos_output_typeid_vector, + cos_contig_dispatch_vector, cos_strided_dispatch_vector); + }; + m.def("_cos", cos_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto cos_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, cos_output_typeid_vector); + }; + m.def("_cos_result_type", cos_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/cos.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/cos.hpp new file mode 100644 index 0000000000..1753058024 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/cos.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_cos(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/cosh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/cosh.cpp new file mode 100644 index 0000000000..0bb74df979 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/cosh.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
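[Editorial illustration, not part of the patch.] The separate contig and strided kernels registered for every operation exist because contiguous data admits a tighter inner loop. A self-contained sketch of picking the fast path from the layout (simplified one-dimensional view, hypothetical struct):

    #include <cstdio>

    struct ArrayView {
        const double *data;
        int size;
        int stride; // in elements
        bool is_c_contig() const { return stride == 1; }
    };

    static double sum_contig(const ArrayView &a)
    {
        double s = 0.0;
        for (int i = 0; i < a.size; ++i)
            s += a.data[i]; // unit-stride loop, easy to vectorize
        return s;
    }

    static double sum_strided(const ArrayView &a)
    {
        double s = 0.0;
        for (int i = 0; i < a.size; ++i)
            s += a.data[i * a.stride]; // general case
        return s;
    }

    static double sum(const ArrayView &a)
    {
        // Pick the fast path when the layout allows it, mirroring the
        // contig/strided split in the dispatch tables above.
        return a.is_c_contig() ? sum_contig(a) : sum_strided(a);
    }

    int main()
    {
        double buf[4] = {1, 2, 3, 4};
        ArrayView contig{buf, 4, 1};
        ArrayView every_other{buf, 2, 2};
        std::printf("%g %g\n", sum(contig), sum(every_other)); // 10 4
        return 0;
    }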
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "cosh.hpp" +#include "elementwise_functions.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/cosh.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U12: ==== COSH (x) +namespace impl +{ + +namespace cosh_fn_ns = dpctl::tensor::kernels::cosh; + +static unary_contig_impl_fn_ptr_t cosh_contig_dispatch_vector[td_ns::num_types]; +static int cosh_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + cosh_strided_dispatch_vector[td_ns::num_types]; + +void populate_cosh_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = cosh_fn_ns; + + using fn_ns::CoshContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(cosh_contig_dispatch_vector); + + using fn_ns::CoshStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(cosh_strided_dispatch_vector); + + using fn_ns::CoshTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(cosh_output_typeid_vector); +}; + +} // namespace impl + +void init_cosh(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_cosh_dispatch_vectors(); + using impl::cosh_contig_dispatch_vector; + using impl::cosh_output_typeid_vector; + using impl::cosh_strided_dispatch_vector; + + auto cosh_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, cosh_output_typeid_vector, + cosh_contig_dispatch_vector, cosh_strided_dispatch_vector); + }; + m.def("_cosh", cosh_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto cosh_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, cosh_output_typeid_vector); + }; + m.def("_cosh_result_type", cosh_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/cosh.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/cosh.hpp new file mode 100644 index 0000000000..c1eba05ea5 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/cosh.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_cosh(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp new file mode 100644 index 0000000000..751e44ff55 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.cpp @@ -0,0 +1,181 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
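[Editorial illustration, not part of the patch.] Every pyapi accepts a vector of sycl::event dependencies and threads it into the kernel submission. A minimal sketch of that chaining, assuming a SYCL 2020 toolchain (for example icpx -fsycl) and unified shared memory:

    #include <sycl/sycl.hpp>
    #include <cstdio>
    #include <vector>

    int main()
    {
        sycl::queue q;
        int *data = sycl::malloc_shared<int>(4, q);

        sycl::event fill_ev = q.fill(data, 1, 4); // producer kernel
        std::vector<sycl::event> depends{fill_ev};

        sycl::event ev = q.submit([&](sycl::handler &cgh) {
            cgh.depends_on(depends); // wait for producers first
            cgh.parallel_for(sycl::range<1>(4),
                             [=](sycl::id<1> i) { data[i] += 1; });
        });
        ev.wait();

        std::printf("%d %d %d %d\n", data[0], data[1], data[2], data[3]);
        sycl::free(data, q);
        return 0;
    }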
+//===----------------------------------------------------------------------===// + +#include + +#include "abs.hpp" +#include "acos.hpp" +#include "acosh.hpp" +#include "add.hpp" +#include "asin.hpp" +#include "asinh.hpp" +#include "atan.hpp" +#include "atan2.hpp" +#include "atanh.hpp" +#include "bitwise_and.hpp" +#include "bitwise_invert.hpp" +#include "bitwise_left_shift.hpp" +#include "bitwise_or.hpp" +#include "bitwise_right_shift.hpp" +#include "bitwise_xor.hpp" +#include "cbrt.hpp" +#include "ceil.hpp" +#include "conj.hpp" +#include "copysign.hpp" +#include "cos.hpp" +#include "cosh.hpp" +#include "equal.hpp" +#include "exp.hpp" +#include "exp2.hpp" +#include "expm1.hpp" +#include "floor.hpp" +#include "floor_divide.hpp" +#include "greater.hpp" +#include "greater_equal.hpp" +#include "hypot.hpp" +#include "imag.hpp" +#include "isfinite.hpp" +#include "isinf.hpp" +#include "isnan.hpp" +#include "less.hpp" +#include "less_equal.hpp" +#include "log.hpp" +#include "log10.hpp" +#include "log1p.hpp" +#include "log2.hpp" +#include "logaddexp.hpp" +#include "logical_and.hpp" +#include "logical_not.hpp" +#include "logical_or.hpp" +#include "logical_xor.hpp" +#include "maximum.hpp" +#include "minimum.hpp" +#include "multiply.hpp" +#include "negative.hpp" +#include "not_equal.hpp" +#include "positive.hpp" +#include "pow.hpp" +#include "proj.hpp" +#include "real.hpp" +#include "remainder.hpp" +#include "round.hpp" +#include "rsqrt.hpp" +#include "sign.hpp" +#include "signbit.hpp" +#include "sin.hpp" +#include "sinh.hpp" +#include "sqrt.hpp" +#include "square.hpp" +#include "subtract.hpp" +#include "tan.hpp" +#include "tanh.hpp" +#include "true_divide.hpp" +#include "trunc.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +/*! 
@brief Add elementwise functions to Python module */ +void init_elementwise_functions(py::module_ m) +{ + init_abs(m); + init_acos(m); + init_acosh(m); + init_add(m); + init_asin(m); + init_asinh(m); + init_atan(m); + init_atan2(m); + init_atanh(m); + init_bitwise_and(m); + init_bitwise_invert(m); + init_bitwise_left_shift(m); + init_bitwise_or(m); + init_bitwise_right_shift(m); + init_bitwise_xor(m); + init_cbrt(m); + init_ceil(m); + init_conj(m); + init_copysign(m); + init_cos(m); + init_cosh(m); + init_divide(m); + init_equal(m); + init_exp(m); + init_exp2(m); + init_expm1(m); + init_floor(m); + init_floor_divide(m); + init_greater(m); + init_greater_equal(m); + init_hypot(m); + init_imag(m); + init_isfinite(m); + init_isinf(m); + init_isnan(m); + init_less(m); + init_less_equal(m); + init_log(m); + init_log10(m); + init_log1p(m); + init_log2(m); + init_logaddexp(m); + init_logical_and(m); + init_logical_not(m); + init_logical_or(m); + init_logical_xor(m); + init_maximum(m); + init_minimum(m); + init_multiply(m); + init_negative(m); + init_not_equal(m); + init_positive(m); + init_pow(m); + init_proj(m); + init_real(m); + init_remainder(m); + init_round(m); + init_rsqrt(m); + init_sign(m); + init_signbit(m); + init_sin(m); + init_sinh(m); + init_sqrt(m); + init_square(m); + init_subtract(m); + init_tan(m); + init_tanh(m); + init_trunc(m); +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp new file mode 100644 index 0000000000..ef9182f9a2 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_common.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
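[Editorial illustration, not part of the patch.] init_elementwise_functions simply fans out to one init_* routine per operation, so each operation's heavily templated dispatch machinery lives in its own translation unit, which presumably keeps rebuilds incremental. The shape of the pattern as a toy (hypothetical names):

    #include <pybind11/pybind11.h>

    namespace py = pybind11;

    // Hypothetical per-operation initializers following the same pattern.
    static void init_foo(py::module_ m) { m.def("_foo", []() { return 1; }); }
    static void init_bar(py::module_ m) { m.def("_bar", []() { return 2; }); }

    static void init_elementwise(py::module_ m)
    {
        init_foo(m);
        init_bar(m);
    }

    PYBIND11_MODULE(example_agg, m) { init_elementwise(m); }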
+//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_elementwise_functions(py::module_); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp similarity index 97% rename from dpctl/tensor/libtensor/source/elementwise_functions.hpp rename to dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp index 523e4259c3..6817a3541c 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions.hpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp @@ -22,7 +22,6 @@ /// This file defines functions of dpctl.tensor._tensor_impl extensions, /// specifically functions for elementwise operations. //===----------------------------------------------------------------------===// - #pragma once #include "dpctl4pybind11.hpp" @@ -30,14 +29,17 @@ #include #include #include -#include #include +#include "elementwise_functions_type_utils.hpp" #include "simplify_iteration_space.hpp" #include "utils/memory_overlap.hpp" #include "utils/offset_utils.hpp" #include "utils/type_dispatch.hpp" +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + namespace dpctl { namespace tensor @@ -45,11 +47,7 @@ namespace tensor namespace py_internal { -namespace td_ns = dpctl::tensor::type_dispatch; - -extern py::dtype _dtype_from_typenum(td_ns::typenum_t dst_typenum_t); -extern int _result_typeid(int arg_typeid, const int *fn_output_id); - +/*! @brief Template implementing Python API for unary elementwise functions */ template @@ -251,6 +249,8 @@ py_unary_ufunc(const dpctl::tensor::usm_ndarray &src, strided_fn_ev); } +/*! @brief Template implementing Python API for querying of type support by + * unary elementwise functions */ template py::object py_unary_ufunc_result_type(const py::dtype &input_dtype, const output_typesT &output_types) @@ -266,6 +266,7 @@ py::object py_unary_ufunc_result_type(const py::dtype &input_dtype, throw py::value_error(e.what()); } + using dpctl::tensor::py_internal::type_utils::_result_typeid; int dst_typeid = _result_typeid(src_typeid, output_types); if (dst_typeid < 0) { @@ -273,8 +274,9 @@ py::object py_unary_ufunc_result_type(const py::dtype &input_dtype, return py::cast(res); } else { - auto dst_typenum_t = static_cast(dst_typeid); + using dpctl::tensor::py_internal::type_utils::_dtype_from_typenum; + auto dst_typenum_t = static_cast(dst_typeid); auto dt = _dtype_from_typenum(dst_typenum_t); return py::cast(dt); @@ -292,6 +294,8 @@ bool isEqual(Container const &c, std::initializer_list const &l) } } // namespace +/*! @brief Template implementing Python API for binary elementwise + * functions */ template py_binary_ufunc( strided_fn_ev); } +/*! 
@brief Type querying for binary elementwise functions */ template <typename output_typesT> py::object py_binary_ufunc_result_type(const py::dtype &input1_dtype, const py::dtype &input2_dtype, @@ -590,8 +595,9 @@ py::object py_binary_ufunc_result_type(const py::dtype &input1_dtype, return py::cast(res); } else { - auto dst_typenum_t = static_cast<td_ns::typenum_t>(dst_typeid); + using dpctl::tensor::py_internal::type_utils::_dtype_from_typenum; + + auto dst_typenum_t = static_cast<td_ns::typenum_t>(dst_typeid); auto dt = _dtype_from_typenum(dst_typenum_t); return py::cast(dt); @@ -825,8 +831,6 @@ py_binary_inplace_ufunc(const dpctl::tensor::usm_ndarray &lhs, strided_fn_ev); } -extern void init_elementwise_functions(py::module_ m); - } // namespace py_internal } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp new file mode 100644 index 0000000000..473048e8fa --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp @@ -0,0 +1,95 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions for looking up supported types in elementwise +/// functions.
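[Editorial illustration, not part of the patch.] py_binary_ufunc_result_type answers the dtype-promotion query without touching any data, by indexing the two-dimensional output-id table with both input type ids. A toy version of that query (hypothetical table values):

    #include <cstdio>

    constexpr int num_types = 3;

    // Hypothetical promotion table: entry [i][j] is the output type id for
    // input type ids (i, j); -1 marks unsupported combinations.
    static const int output_id_table[num_types][num_types] = {
        {0, 1, 2},
        {1, 1, 2},
        {2, 2, -1},
    };

    static int binary_result_typeid(int t1, int t2)
    {
        if (t1 < 0 || t1 >= num_types || t2 < 0 || t2 >= num_types)
            return -1;
        return output_id_table[t1][t2];
    }

    int main()
    {
        std::printf("%d\n", binary_result_typeid(0, 1)); // prints 1
        return 0;
    }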
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include + +#include "elementwise_functions_type_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ +namespace type_utils +{ + +py::dtype _dtype_from_typenum(td_ns::typenum_t dst_typenum_t) +{ + switch (dst_typenum_t) { + case td_ns::typenum_t::BOOL: + return py::dtype("?"); + case td_ns::typenum_t::INT8: + return py::dtype("i1"); + case td_ns::typenum_t::UINT8: + return py::dtype("u1"); + case td_ns::typenum_t::INT16: + return py::dtype("i2"); + case td_ns::typenum_t::UINT16: + return py::dtype("u2"); + case td_ns::typenum_t::INT32: + return py::dtype("i4"); + case td_ns::typenum_t::UINT32: + return py::dtype("u4"); + case td_ns::typenum_t::INT64: + return py::dtype("i8"); + case td_ns::typenum_t::UINT64: + return py::dtype("u8"); + case td_ns::typenum_t::HALF: + return py::dtype("f2"); + case td_ns::typenum_t::FLOAT: + return py::dtype("f4"); + case td_ns::typenum_t::DOUBLE: + return py::dtype("f8"); + case td_ns::typenum_t::CFLOAT: + return py::dtype("c8"); + case td_ns::typenum_t::CDOUBLE: + return py::dtype("c16"); + default: + throw py::value_error("Unrecognized dst_typeid"); + } +} + +int _result_typeid(int arg_typeid, const int *fn_output_id) +{ + if (arg_typeid < 0 || arg_typeid >= td_ns::num_types) { + throw py::value_error("Input typeid " + std::to_string(arg_typeid) + + " is outside of expected bounds."); + } + + return fn_output_id[arg_typeid]; +} + +} // namespace type_utils +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp new file mode 100644 index 0000000000..6dac195dc2 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp @@ -0,0 +1,56 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file declares functions for looking up supported types in elementwise +/// functions. +//===----------------------------------------------------------------------===// + +#pragma once +#include "dpctl4pybind11.hpp" +#include +#include +#include + +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ +namespace type_utils +{ + +/*! @brief Produce dtype from a type number */ +extern py::dtype _dtype_from_typenum(td_ns::typenum_t); + +/*! 
@brief Lookup typeid of the result from typeid of + * argument and the mapping table */ +extern int _result_typeid(int, const int *); + +} // namespace type_utils +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/equal.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/equal.cpp new file mode 100644 index 0000000000..f36ec1b446 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/equal.cpp @@ -0,0 +1,140 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "equal.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/equal.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +// B09: ===== EQUAL (x1, x2) +namespace impl +{ +namespace equal_fn_ns = dpctl::tensor::kernels::equal; + +static binary_contig_impl_fn_ptr_t + equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int equal_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_equal_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = equal_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::EqualTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(equal_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::EqualStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(equal_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::EqualContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(equal_contig_dispatch_table); +}; + +} // namespace impl + +void init_equal(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + 
impl::populate_equal_dispatch_tables(); + using impl::equal_contig_dispatch_table; + using impl::equal_output_id_table; + using impl::equal_strided_dispatch_table; + + auto equal_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, equal_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + equal_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + equal_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto equal_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + equal_output_id_table); + }; + m.def("_equal", equal_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_equal_result_type", equal_result_type_pyapi, ""); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/equal.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/equal.hpp new file mode 100644 index 0000000000..21ac4ad6b4 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/equal.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
+//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_equal(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/exp.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/exp.cpp new file mode 100644 index 0000000000..51ccaaac70 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/exp.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "exp.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/exp.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U13: ==== EXP (x) +namespace impl +{ + +namespace exp_fn_ns = dpctl::tensor::kernels::exp; + +static unary_contig_impl_fn_ptr_t exp_contig_dispatch_vector[td_ns::num_types]; +static int exp_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + exp_strided_dispatch_vector[td_ns::num_types]; + +void populate_exp_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = exp_fn_ns; + + using fn_ns::ExpContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(exp_contig_dispatch_vector); + + using fn_ns::ExpStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(exp_strided_dispatch_vector); + + using fn_ns::ExpTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(exp_output_typeid_vector); +}; + +} // namespace impl + +void init_exp(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_exp_dispatch_vectors(); + using impl::exp_contig_dispatch_vector; + using impl::exp_output_typeid_vector; + using impl::exp_strided_dispatch_vector; + + auto exp_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, 
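+                // unary dispatch data: per-type output typeid map, then
+                // contiguous-case and strided-case kernels (entries may be
+                // nullptr for unsupported types), mirroring the commented
+                // binary dispatch arguments in equal.cpp above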
exp_output_typeid_vector, + exp_contig_dispatch_vector, exp_strided_dispatch_vector); + }; + m.def("_exp", exp_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto exp_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, exp_output_typeid_vector); + }; + m.def("_exp_result_type", exp_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/exp.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/exp.hpp new file mode 100644 index 0000000000..7227f0a2dc --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/exp.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_exp(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/exp2.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/exp2.cpp new file mode 100644 index 0000000000..438ad0800e --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/exp2.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
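+///
+/// A hypothetical Python-level call into the binding defined below, assuming
+/// x and y are preallocated, equally-shaped dpctl.tensor arrays allocated on
+/// queue q (argument names follow the m.def signature in this file):
+///
+///     import dpctl.tensor._tensor_impl as ti
+///     events = ti._exp2(src=x, dst=y, sycl_queue=q, depends=[])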
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "exp2.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/exp2.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U38: ==== EXP2 (x) +namespace impl +{ + +namespace exp2_fn_ns = dpctl::tensor::kernels::exp2; + +static unary_contig_impl_fn_ptr_t exp2_contig_dispatch_vector[td_ns::num_types]; +static int exp2_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + exp2_strided_dispatch_vector[td_ns::num_types]; + +void populate_exp2_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = exp2_fn_ns; + + using fn_ns::Exp2ContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(exp2_contig_dispatch_vector); + + using fn_ns::Exp2StridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(exp2_strided_dispatch_vector); + + using fn_ns::Exp2TypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(exp2_output_typeid_vector); +}; + +} // namespace impl + +void init_exp2(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_exp2_dispatch_vectors(); + using impl::exp2_contig_dispatch_vector; + using impl::exp2_output_typeid_vector; + using impl::exp2_strided_dispatch_vector; + + auto exp2_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, exp2_output_typeid_vector, + exp2_contig_dispatch_vector, exp2_strided_dispatch_vector); + }; + m.def("_exp2", exp2_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto exp2_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, exp2_output_typeid_vector); + }; + m.def("_exp2_result_type", exp2_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/exp2.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/exp2.hpp new file mode 100644 index 0000000000..be041e1f8d --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/exp2.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_exp2(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/expm1.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/expm1.cpp new file mode 100644 index 0000000000..3b9332c4f1 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/expm1.cpp @@ -0,0 +1,121 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
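+///
+/// Note: expm1(x) evaluates exp(x) - 1 while staying accurate for small |x|,
+/// where computing exp(x) first and then subtracting 1 would cancel most
+/// significant digits (e.g. for x = 1e-10 the result is ~1.00000000005e-10,
+/// which the naive formulation cannot resolve in double precision).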
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "expm1.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/expm1.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U14: ==== EXPM1 (x) +namespace impl +{ + +namespace expm1_fn_ns = dpctl::tensor::kernels::expm1; + +static unary_contig_impl_fn_ptr_t + expm1_contig_dispatch_vector[td_ns::num_types]; +static int expm1_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + expm1_strided_dispatch_vector[td_ns::num_types]; + +void populate_expm1_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = expm1_fn_ns; + + using fn_ns::Expm1ContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(expm1_contig_dispatch_vector); + + using fn_ns::Expm1StridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(expm1_strided_dispatch_vector); + + using fn_ns::Expm1TypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(expm1_output_typeid_vector); +}; + +} // namespace impl + +void init_expm1(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_expm1_dispatch_vectors(); + using impl::expm1_contig_dispatch_vector; + using impl::expm1_output_typeid_vector; + using impl::expm1_strided_dispatch_vector; + + auto expm1_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, expm1_output_typeid_vector, + expm1_contig_dispatch_vector, expm1_strided_dispatch_vector); + }; + m.def("_expm1", expm1_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto expm1_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + expm1_output_typeid_vector); + }; + m.def("_expm1_result_type", expm1_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/expm1.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/expm1.hpp new file mode 100644 index 0000000000..6e39644835 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/expm1.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_expm1(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/floor.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/floor.cpp new file mode 100644 index 0000000000..9ccf89f13a --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/floor.cpp @@ -0,0 +1,121 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
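+///
+/// Note: floor rounds toward negative infinity, so floor(-1.5) == -2.0 (not
+/// -1.0); for boolean and integer inputs the value is expected to pass
+/// through unchanged.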
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "floor.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/floor.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U15: ==== FLOOR (x) +namespace impl +{ + +namespace floor_fn_ns = dpctl::tensor::kernels::floor; + +static unary_contig_impl_fn_ptr_t + floor_contig_dispatch_vector[td_ns::num_types]; +static int floor_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + floor_strided_dispatch_vector[td_ns::num_types]; + +void populate_floor_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = floor_fn_ns; + + using fn_ns::FloorContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(floor_contig_dispatch_vector); + + using fn_ns::FloorStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(floor_strided_dispatch_vector); + + using fn_ns::FloorTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(floor_output_typeid_vector); +}; + +} // namespace impl + +void init_floor(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_floor_dispatch_vectors(); + using impl::floor_contig_dispatch_vector; + using impl::floor_output_typeid_vector; + using impl::floor_strided_dispatch_vector; + + auto floor_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, floor_output_typeid_vector, + floor_contig_dispatch_vector, floor_strided_dispatch_vector); + }; + m.def("_floor", floor_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto floor_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + floor_output_typeid_vector); + }; + m.def("_floor_result_type", floor_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/floor.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/floor.hpp new file mode 100644 index 0000000000..b742b058ad --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/floor.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_floor(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.cpp new file mode 100644 index 0000000000..e75fc56c67 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.cpp @@ -0,0 +1,190 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
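+///
+/// Note: floor_divide rounds the quotient toward negative infinity, matching
+/// Python's // operator: floor_divide(7, 2) == 3 while
+/// floor_divide(-7, 2) == -4.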
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "floor_divide.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/common_inplace.hpp" +#include "kernels/elementwise_functions/floor_divide.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t; + +// B10: ===== FLOOR_DIVIDE (x1, x2) +namespace impl +{ +namespace floor_divide_fn_ns = dpctl::tensor::kernels::floor_divide; + +static binary_contig_impl_fn_ptr_t + floor_divide_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int floor_divide_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + floor_divide_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +static binary_inplace_contig_impl_fn_ptr_t + floor_divide_inplace_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static binary_inplace_strided_impl_fn_ptr_t + floor_divide_inplace_strided_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_floor_divide_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = floor_divide_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::FloorDivideTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(floor_divide_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::FloorDivideStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(floor_divide_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::FloorDivideContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(floor_divide_contig_dispatch_table); + + // function pointers for inplace operation on general strided arrays + using fn_ns::FloorDivideInplaceStridedFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(floor_divide_inplace_strided_dispatch_table); + + // function pointers for inplace operation on contiguous inputs and output + using fn_ns::FloorDivideInplaceContigFactory; + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(floor_divide_inplace_contig_dispatch_table); +}; + +} // namespace impl + +void init_floor_divide(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_floor_divide_dispatch_tables(); + using impl::floor_divide_contig_dispatch_table; + using impl::floor_divide_output_id_table; + using impl::floor_divide_strided_dispatch_table; + + auto floor_divide_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, 
floor_divide_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + floor_divide_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + floor_divide_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto floor_divide_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + floor_divide_output_id_table); + }; + m.def("_floor_divide", floor_divide_pyapi, "", py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_floor_divide_result_type", floor_divide_result_type_pyapi, ""); + + using impl::floor_divide_inplace_contig_dispatch_table; + using impl::floor_divide_inplace_strided_dispatch_table; + + auto floor_divide_inplace_pyapi = + [&](const arrayT &src, const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_inplace_ufunc( + src, dst, exec_q, depends, floor_divide_output_id_table, + // function pointers to handle inplace operation on + // contiguous arrays (pointers may be nullptr) + floor_divide_inplace_contig_dispatch_table, + // function pointers to handle inplace operation on strided + // arrays (most general case) + floor_divide_inplace_strided_dispatch_table, + // function pointers to handle inplace operation on + // c-contig matrix with c-contig row with broadcasting + // (may be nullptr) + td_ns::NullPtrTable< + binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{}); + }; + m.def("_floor_divide_inplace", floor_divide_inplace_pyapi, "", + py::arg("lhs"), py::arg("rhs"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.hpp new file mode 100644 index 0000000000..c7f0d40dcc --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
+//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_floor_divide(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/greater.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/greater.cpp new file mode 100644 index 0000000000..f79102df47 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/greater.cpp @@ -0,0 +1,140 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "greater.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/greater.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +// B11: ===== GREATER (x1, x2) +namespace impl +{ +namespace greater_fn_ns = dpctl::tensor::kernels::greater; + +static binary_contig_impl_fn_ptr_t + greater_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int greater_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + greater_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_greater_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = greater_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::GreaterTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(greater_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::GreaterStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(greater_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::GreaterContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(greater_contig_dispatch_table); +}; + +} // namespace 
impl + +void init_greater(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_greater_dispatch_tables(); + using impl::greater_contig_dispatch_table; + using impl::greater_output_id_table; + using impl::greater_strided_dispatch_table; + + auto greater_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, greater_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + greater_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + greater_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto greater_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + greater_output_id_table); + }; + m.def("_greater", greater_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_greater_result_type", greater_result_type_pyapi, ""); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/greater.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/greater.hpp new file mode 100644 index 0000000000..ba8dc57bb0 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/greater.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
+//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_greater(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.cpp new file mode 100644 index 0000000000..005679c3fb --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.cpp @@ -0,0 +1,141 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "greater_equal.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/greater_equal.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +// B12: ===== GREATER_EQUAL (x1, x2) +namespace impl +{ +namespace greater_equal_fn_ns = dpctl::tensor::kernels::greater_equal; + +static binary_contig_impl_fn_ptr_t + greater_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int greater_equal_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + greater_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_greater_equal_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = greater_equal_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::GreaterEqualTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(greater_equal_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::GreaterEqualStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(greater_equal_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::GreaterEqualContigFactory; + 
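+    // (Each DispatchTableBuilder below is assumed to expand roughly to
+    //      for every pair of supported types (T1, T2):
+    //          table[typeid(T1)][typeid(T2)] = Factory<T1, T2>{}.get();
+    //  leaving a sentinel entry for unsupported combinations.)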
DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(greater_equal_contig_dispatch_table); +}; + +} // namespace impl + +void init_greater_equal(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_greater_equal_dispatch_tables(); + using impl::greater_equal_contig_dispatch_table; + using impl::greater_equal_output_id_table; + using impl::greater_equal_strided_dispatch_table; + + auto greater_equal_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, greater_equal_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + greater_equal_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + greater_equal_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto greater_equal_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + greater_equal_output_id_table); + }; + m.def("_greater_equal", greater_equal_pyapi, "", py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_greater_equal_result_type", greater_equal_result_type_pyapi, + ""); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.hpp new file mode 100644 index 0000000000..2cf116566e --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
+//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_greater_equal(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/hypot.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/hypot.cpp new file mode 100644 index 0000000000..2442710198 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/hypot.cpp @@ -0,0 +1,140 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "hypot.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/hypot.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +// B24: ===== HYPOT (x1, x2) +namespace impl +{ +namespace hypot_fn_ns = dpctl::tensor::kernels::hypot; + +static binary_contig_impl_fn_ptr_t + hypot_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int hypot_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + hypot_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_hypot_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = hypot_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::HypotTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(hypot_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::HypotStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(hypot_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::HypotContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(hypot_contig_dispatch_table); +}; + +} // namespace impl + +void init_hypot(py::module_ 
m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_hypot_dispatch_tables(); + using impl::hypot_contig_dispatch_table; + using impl::hypot_output_id_table; + using impl::hypot_strided_dispatch_table; + + auto hypot_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, hypot_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + hypot_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + hypot_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto hypot_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + hypot_output_id_table); + }; + m.def("_hypot", hypot_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_hypot_result_type", hypot_result_type_pyapi, ""); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/hypot.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/hypot.hpp new file mode 100644 index 0000000000..2d154917ea --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/hypot.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
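+///
+/// Note: hypot(x1, x2) computes sqrt(x1*x1 + x2*x2), conventionally without
+/// undue overflow or underflow in the intermediate squares, so e.g.
+/// hypot(3e200, 4e200) == 5e200 in double precision rather than inf.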
+//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_hypot(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/imag.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/imag.cpp new file mode 100644 index 0000000000..4012b9206f --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/imag.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "imag.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/imag.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U16: ==== IMAG (x) +namespace impl +{ + +namespace imag_fn_ns = dpctl::tensor::kernels::imag; + +static unary_contig_impl_fn_ptr_t imag_contig_dispatch_vector[td_ns::num_types]; +static int imag_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + imag_strided_dispatch_vector[td_ns::num_types]; + +void populate_imag_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = imag_fn_ns; + + using fn_ns::ImagContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(imag_contig_dispatch_vector); + + using fn_ns::ImagStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(imag_strided_dispatch_vector); + + using fn_ns::ImagTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(imag_output_typeid_vector); +}; + +} // namespace impl + +void init_imag(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_imag_dispatch_vectors(); + using impl::imag_contig_dispatch_vector; + using impl::imag_output_typeid_vector; + using impl::imag_strided_dispatch_vector; + + auto imag_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, 
depends, imag_output_typeid_vector,
+                imag_contig_dispatch_vector, imag_strided_dispatch_vector);
+        };
+        m.def("_imag", imag_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto imag_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, imag_output_typeid_vector);
+        };
+        m.def("_imag_result_type", imag_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/imag.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/imag.hpp
new file mode 100644
index 0000000000..ffac3f2465
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/imag.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_imag(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.cpp
new file mode 100644
index 0000000000..73a2be4010
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.cpp
@@ -0,0 +1,122 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
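+///
+/// Note: isfinite maps each element to a boolean that is true iff the value
+/// is neither an infinity nor a NaN; boolean and integer inputs are always
+/// finite, so for them the result is all true.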
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "isfinite.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/isfinite.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U17: ==== ISFINITE (x)
+namespace impl
+{
+
+namespace isfinite_fn_ns = dpctl::tensor::kernels::isfinite;
+
+static unary_contig_impl_fn_ptr_t
+    isfinite_contig_dispatch_vector[td_ns::num_types];
+static int isfinite_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    isfinite_strided_dispatch_vector[td_ns::num_types];
+
+void populate_isfinite_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = isfinite_fn_ns;
+
+    using fn_ns::IsFiniteContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, IsFiniteContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(isfinite_contig_dispatch_vector);
+
+    using fn_ns::IsFiniteStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, IsFiniteStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(isfinite_strided_dispatch_vector);
+
+    using fn_ns::IsFiniteTypeMapFactory;
+    DispatchVectorBuilder<int, IsFiniteTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(isfinite_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_isfinite(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_isfinite_dispatch_vectors();
+        using impl::isfinite_contig_dispatch_vector;
+        using impl::isfinite_output_typeid_vector;
+        using impl::isfinite_strided_dispatch_vector;
+
+        auto isfinite_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                  sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  isfinite_output_typeid_vector,
+                                  isfinite_contig_dispatch_vector,
+                                  isfinite_strided_dispatch_vector);
+        };
+        m.def("_isfinite", isfinite_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto isfinite_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              isfinite_output_typeid_vector);
+        };
+        m.def("_isfinite_result_type", isfinite_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.hpp
new file mode 100644
index 0000000000..fd7508792b
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_isfinite(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/isinf.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/isinf.cpp
new file mode 100644
index 0000000000..2600fe4f74
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/isinf.cpp
@@ -0,0 +1,121 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "isinf.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/isinf.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U18: ==== ISINF (x) +namespace impl +{ + +namespace isinf_fn_ns = dpctl::tensor::kernels::isinf; + +static unary_contig_impl_fn_ptr_t + isinf_contig_dispatch_vector[td_ns::num_types]; +static int isinf_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + isinf_strided_dispatch_vector[td_ns::num_types]; + +void populate_isinf_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = isinf_fn_ns; + + using fn_ns::IsInfContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(isinf_contig_dispatch_vector); + + using fn_ns::IsInfStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(isinf_strided_dispatch_vector); + + using fn_ns::IsInfTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(isinf_output_typeid_vector); +}; + +} // namespace impl + +void init_isinf(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_isinf_dispatch_vectors(); + using impl::isinf_contig_dispatch_vector; + using impl::isinf_output_typeid_vector; + using impl::isinf_strided_dispatch_vector; + + auto isinf_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, isinf_output_typeid_vector, + isinf_contig_dispatch_vector, isinf_strided_dispatch_vector); + }; + m.def("_isinf", isinf_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto isinf_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + isinf_output_typeid_vector); + }; + m.def("_isinf_result_type", isinf_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/isinf.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/isinf.hpp new file mode 100644 index 0000000000..8c3cd51c91 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/isinf.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_isinf(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/isnan.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/isnan.cpp new file mode 100644 index 0000000000..b75618c5e0 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/isnan.cpp @@ -0,0 +1,121 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "isnan.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/isnan.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U19: ==== ISNAN (x) +namespace impl +{ + +namespace isnan_fn_ns = dpctl::tensor::kernels::isnan; + +static unary_contig_impl_fn_ptr_t + isnan_contig_dispatch_vector[td_ns::num_types]; +static int isnan_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + isnan_strided_dispatch_vector[td_ns::num_types]; + +void populate_isnan_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = isnan_fn_ns; + + using fn_ns::IsNanContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(isnan_contig_dispatch_vector); + + using fn_ns::IsNanStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(isnan_strided_dispatch_vector); + + using fn_ns::IsNanTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(isnan_output_typeid_vector); +}; + +} // namespace impl + +void init_isnan(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_isnan_dispatch_vectors(); + using impl::isnan_contig_dispatch_vector; + using impl::isnan_output_typeid_vector; + using impl::isnan_strided_dispatch_vector; + + auto isnan_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, isnan_output_typeid_vector, + isnan_contig_dispatch_vector, isnan_strided_dispatch_vector); + }; + m.def("_isnan", isnan_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto isnan_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + isnan_output_typeid_vector); + }; + m.def("_isnan_result_type", isnan_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/isnan.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/isnan.hpp new file mode 100644 index 0000000000..df1f41d47f --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/isnan.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_isnan(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/less.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/less.cpp new file mode 100644 index 0000000000..c34122d862 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/less.cpp @@ -0,0 +1,140 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
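+///
+/// Unlike the unary sources, the binary operations below dispatch on the
+/// pair of input type ids, so the lookup structures are two-dimensional
+/// tables. A hedged sketch of the lookup (illustrative names, not the
+/// dpctl API; unsupported combinations carry a negative result id):
+///
+/// \code{.cpp}
+/// int dst_typeid = output_id_table[src1_typeid][src2_typeid];
+/// if (dst_typeid < 0) {
+///     // dtype combination not supported; raise an error
+/// }
+/// auto contig_fn = contig_table[src1_typeid][src2_typeid]; // may be nullptr
+/// \endcode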
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "less.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/less.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +// B13: ===== LESS (x1, x2) +namespace impl +{ +namespace less_fn_ns = dpctl::tensor::kernels::less; + +static binary_contig_impl_fn_ptr_t less_contig_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static int less_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + less_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_less_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = less_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::LessTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(less_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::LessStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(less_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::LessContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(less_contig_dispatch_table); +}; + +} // namespace impl + +void init_less(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_less_dispatch_tables(); + using impl::less_contig_dispatch_table; + using impl::less_output_id_table; + using impl::less_strided_dispatch_table; + + auto less_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, less_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + less_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + less_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto less_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + less_output_id_table); + }; + m.def("_less", less_pyapi, "", py::arg("src1"), py::arg("src2"), + py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_less_result_type", less_result_type_pyapi, ""); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff 
--git a/dpctl/tensor/libtensor/source/elementwise_functions/less.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/less.hpp new file mode 100644 index 0000000000..dada4b4be7 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/less.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_less(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.cpp new file mode 100644 index 0000000000..712b30d902 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.cpp @@ -0,0 +1,140 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "less_equal.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/less_equal.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +// B14: ===== LESS_EQUAL (x1, x2) +namespace impl +{ +namespace less_equal_fn_ns = dpctl::tensor::kernels::less_equal; + +static binary_contig_impl_fn_ptr_t + less_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int less_equal_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + less_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_less_equal_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = less_equal_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::LessEqualTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(less_equal_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::LessEqualStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(less_equal_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::LessEqualContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(less_equal_contig_dispatch_table); +}; + +} // namespace impl + +void init_less_equal(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_less_equal_dispatch_tables(); + using impl::less_equal_contig_dispatch_table; + using impl::less_equal_output_id_table; + using impl::less_equal_strided_dispatch_table; + + auto less_equal_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, less_equal_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + less_equal_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + less_equal_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto less_equal_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + less_equal_output_id_table); + }; + m.def("_less_equal", less_equal_pyapi, "", py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + 
py::arg("depends") = py::list()); + m.def("_less_equal_result_type", less_equal_result_type_pyapi, ""); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.hpp new file mode 100644 index 0000000000..e52ee3b940 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_less_equal(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/log.cpp new file mode 100644 index 0000000000..f73b9e2414 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/log.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "log.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/log.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U20: ==== LOG (x) +namespace impl +{ + +namespace log_fn_ns = dpctl::tensor::kernels::log; + +static unary_contig_impl_fn_ptr_t log_contig_dispatch_vector[td_ns::num_types]; +static int log_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + log_strided_dispatch_vector[td_ns::num_types]; + +void populate_log_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = log_fn_ns; + + using fn_ns::LogContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(log_contig_dispatch_vector); + + using fn_ns::LogStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(log_strided_dispatch_vector); + + using fn_ns::LogTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(log_output_typeid_vector); +}; + +} // namespace impl + +void init_log(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_log_dispatch_vectors(); + using impl::log_contig_dispatch_vector; + using impl::log_output_typeid_vector; + using impl::log_strided_dispatch_vector; + + auto log_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, log_output_typeid_vector, + log_contig_dispatch_vector, log_strided_dispatch_vector); + }; + m.def("_log", log_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto log_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, log_output_typeid_vector); + }; + m.def("_log_result_type", log_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/log.hpp new file mode 100644 index 0000000000..1ca152d174 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/log.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_log(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log10.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/log10.cpp new file mode 100644 index 0000000000..566dfcbcf7 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/log10.cpp @@ -0,0 +1,121 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "log10.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/log10.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U23: ==== LOG10 (x) +namespace impl +{ + +namespace log10_fn_ns = dpctl::tensor::kernels::log10; + +static unary_contig_impl_fn_ptr_t + log10_contig_dispatch_vector[td_ns::num_types]; +static int log10_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + log10_strided_dispatch_vector[td_ns::num_types]; + +void populate_log10_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = log10_fn_ns; + + using fn_ns::Log10ContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(log10_contig_dispatch_vector); + + using fn_ns::Log10StridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(log10_strided_dispatch_vector); + + using fn_ns::Log10TypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(log10_output_typeid_vector); +}; + +} // namespace impl + +void init_log10(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_log10_dispatch_vectors(); + using impl::log10_contig_dispatch_vector; + using impl::log10_output_typeid_vector; + using impl::log10_strided_dispatch_vector; + + auto log10_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, log10_output_typeid_vector, + log10_contig_dispatch_vector, log10_strided_dispatch_vector); + }; + m.def("_log10", log10_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto log10_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + log10_output_typeid_vector); + }; + m.def("_log10_result_type", log10_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log10.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/log10.hpp new file mode 100644 index 0000000000..3972695849 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/log10.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_log10(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log1p.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/log1p.cpp new file mode 100644 index 0000000000..badb474778 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/log1p.cpp @@ -0,0 +1,121 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
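+///
+/// log1p exists alongside log for accuracy near zero: in double precision,
+/// 1.0 + 1e-17 rounds to exactly 1.0, so log(1.0 + 1e-17) evaluates to 0.0,
+/// whereas log1p(1e-17) returns approximately 1e-17. Avoiding the explicit
+/// sum preserves the leading terms of log(1 + x) = x - x^2/2 + x^3/3 - ...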
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "log1p.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/log1p.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U21: ==== LOG1P (x) +namespace impl +{ + +namespace log1p_fn_ns = dpctl::tensor::kernels::log1p; + +static unary_contig_impl_fn_ptr_t + log1p_contig_dispatch_vector[td_ns::num_types]; +static int log1p_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + log1p_strided_dispatch_vector[td_ns::num_types]; + +void populate_log1p_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = log1p_fn_ns; + + using fn_ns::Log1pContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(log1p_contig_dispatch_vector); + + using fn_ns::Log1pStridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(log1p_strided_dispatch_vector); + + using fn_ns::Log1pTypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(log1p_output_typeid_vector); +}; + +} // namespace impl + +void init_log1p(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_log1p_dispatch_vectors(); + using impl::log1p_contig_dispatch_vector; + using impl::log1p_output_typeid_vector; + using impl::log1p_strided_dispatch_vector; + + auto log1p_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, log1p_output_typeid_vector, + log1p_contig_dispatch_vector, log1p_strided_dispatch_vector); + }; + m.def("_log1p", log1p_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto log1p_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, + log1p_output_typeid_vector); + }; + m.def("_log1p_result_type", log1p_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log1p.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/log1p.hpp new file mode 100644 index 0000000000..438b93601c --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/log1p.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_log1p(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log2.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/log2.cpp new file mode 100644 index 0000000000..b5a8a39684 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/log2.cpp @@ -0,0 +1,119 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "log2.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/log2.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::unary_contig_impl_fn_ptr_t; +using ew_cmn_ns::unary_strided_impl_fn_ptr_t; + +// U22: ==== LOG2 (x) +namespace impl +{ + +namespace log2_fn_ns = dpctl::tensor::kernels::log2; + +static unary_contig_impl_fn_ptr_t log2_contig_dispatch_vector[td_ns::num_types]; +static int log2_output_typeid_vector[td_ns::num_types]; +static unary_strided_impl_fn_ptr_t + log2_strided_dispatch_vector[td_ns::num_types]; + +void populate_log2_dispatch_vectors(void) +{ + using namespace td_ns; + namespace fn_ns = log2_fn_ns; + + using fn_ns::Log2ContigFactory; + DispatchVectorBuilder + dvb1; + dvb1.populate_dispatch_vector(log2_contig_dispatch_vector); + + using fn_ns::Log2StridedFactory; + DispatchVectorBuilder + dvb2; + dvb2.populate_dispatch_vector(log2_strided_dispatch_vector); + + using fn_ns::Log2TypeMapFactory; + DispatchVectorBuilder dvb3; + dvb3.populate_dispatch_vector(log2_output_typeid_vector); +}; + +} // namespace impl + +void init_log2(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_log2_dispatch_vectors(); + using impl::log2_contig_dispatch_vector; + using impl::log2_output_typeid_vector; + using impl::log2_strided_dispatch_vector; + + auto log2_pyapi = [&](const arrayT &src, const arrayT &dst, + sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_unary_ufunc( + src, dst, exec_q, depends, log2_output_typeid_vector, + log2_contig_dispatch_vector, log2_strided_dispatch_vector); + }; + m.def("_log2", log2_pyapi, "", py::arg("src"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto log2_result_type_pyapi = [&](const py::dtype &dtype) { + return py_unary_ufunc_result_type(dtype, log2_output_typeid_vector); + }; + m.def("_log2_result_type", log2_result_type_pyapi); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log2.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/log2.hpp new file mode 100644 index 0000000000..4e47ed369a --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/log2.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_log2(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.cpp new file mode 100644 index 0000000000..77ded230be --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.cpp @@ -0,0 +1,140 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
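+///
+/// logaddexp computes log(exp(x1) + exp(x2)) without overflowing the
+/// intermediate exponentials. The standard stable rewriting is sketched
+/// below (scalar double, ignoring inf/nan special cases; the actual kernel
+/// lives under kernels/elementwise_functions/logaddexp.hpp):
+///
+/// \code{.cpp}
+/// #include <cmath>
+/// double logaddexp(double a, double b)
+/// {
+///     double mx = std::fmax(a, b); // exp(-|a - b|) <= 1, so no overflow
+///     return mx + std::log1p(std::exp(-std::fabs(a - b)));
+/// }
+/// \endcode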
+//===----------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "elementwise_functions.hpp" +#include "logaddexp.hpp" +#include "utils/type_dispatch.hpp" + +#include "kernels/elementwise_functions/common.hpp" +#include "kernels/elementwise_functions/logaddexp.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common; +using ew_cmn_ns::binary_contig_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t; +using ew_cmn_ns::binary_strided_impl_fn_ptr_t; + +// B15: ===== LOGADDEXP (x1, x2) +namespace impl +{ +namespace logaddexp_fn_ns = dpctl::tensor::kernels::logaddexp; + +static binary_contig_impl_fn_ptr_t + logaddexp_contig_dispatch_table[td_ns::num_types][td_ns::num_types]; +static int logaddexp_output_id_table[td_ns::num_types][td_ns::num_types]; + +static binary_strided_impl_fn_ptr_t + logaddexp_strided_dispatch_table[td_ns::num_types][td_ns::num_types]; + +void populate_logaddexp_dispatch_tables(void) +{ + using namespace td_ns; + namespace fn_ns = logaddexp_fn_ns; + + // which input types are supported, and what is the type of the result + using fn_ns::LogAddExpTypeMapFactory; + DispatchTableBuilder dtb1; + dtb1.populate_dispatch_table(logaddexp_output_id_table); + + // function pointers for operation on general strided arrays + using fn_ns::LogAddExpStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(logaddexp_strided_dispatch_table); + + // function pointers for operation on contiguous inputs and output + using fn_ns::LogAddExpContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(logaddexp_contig_dispatch_table); +}; + +} // namespace impl + +void init_logaddexp(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + impl::populate_logaddexp_dispatch_tables(); + using impl::logaddexp_contig_dispatch_table; + using impl::logaddexp_output_id_table; + using impl::logaddexp_strided_dispatch_table; + + auto logaddexp_pyapi = [&](const arrayT &src1, const arrayT &src2, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + return py_binary_ufunc( + src1, src2, dst, exec_q, depends, logaddexp_output_id_table, + // function pointers to handle operation on contiguous arrays + // (pointers may be nullptr) + logaddexp_contig_dispatch_table, + // function pointers to handle operation on strided arrays (most + // general case) + logaddexp_strided_dispatch_table, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{}, + // function pointers to handle operation of c-contig matrix and + // c-contig row with broadcasting (may be nullptr) + td_ns::NullPtrTable< + binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{}); + }; + auto logaddexp_result_type_pyapi = [&](const py::dtype &dtype1, + const py::dtype &dtype2) { + return py_binary_ufunc_result_type(dtype1, dtype2, + logaddexp_output_id_table); + }; + m.def("_logaddexp", logaddexp_pyapi, "", py::arg("src1"), + py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"), + py::arg("depends") = py::list()); + 
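+        // _logaddexp_result_type complements _logaddexp: it consults the
+        // same output-id table to report which dtype the operation would
+        // produce for a pair of input dtypes, without launching a kernel.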
m.def("_logaddexp_result_type", logaddexp_result_type_pyapi, ""); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.hpp new file mode 100644 index 0000000000..6601b3f9c5 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_logaddexp(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.cpp new file mode 100644 index 0000000000..4c573ce508 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.cpp @@ -0,0 +1,140 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.cpp
new file mode 100644
index 0000000000..4c573ce508
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "logical_and.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logical_and.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B16: ===== LOGICAL_AND (x1, x2)
+namespace impl
+{
+namespace logical_and_fn_ns = dpctl::tensor::kernels::logical_and;
+
+static binary_contig_impl_fn_ptr_t
+    logical_and_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int logical_and_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    logical_and_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_logical_and_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logical_and_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LogicalAndTypeMapFactory;
+    DispatchTableBuilder<int, LogicalAndTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(logical_and_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LogicalAndStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
+                         LogicalAndStridedFactory, num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(logical_and_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LogicalAndContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogicalAndContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(logical_and_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_logical_and(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logical_and_dispatch_tables();
+        using impl::logical_and_contig_dispatch_table;
+        using impl::logical_and_output_id_table;
+        using impl::logical_and_strided_dispatch_table;
+
+        auto logical_and_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                     const arrayT &dst, sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, logical_and_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                logical_and_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                logical_and_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto logical_and_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                 const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               logical_and_output_id_table);
+        };
+        m.def("_logical_and", logical_and_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"),
py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_logical_and_result_type", logical_and_result_type_pyapi, ""); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.hpp new file mode 100644 index 0000000000..ee73f7c8d5 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_logical_and(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.cpp new file mode 100644 index 0000000000..84362cd9ce --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.cpp @@ -0,0 +1,123 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.cpp
new file mode 100644
index 0000000000..84362cd9ce
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.cpp
@@ -0,0 +1,123 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "logical_not.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logical_not.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U24: ==== LOGICAL_NOT (x)
+namespace impl
+{
+
+namespace logical_not_fn_ns = dpctl::tensor::kernels::logical_not;
+
+static unary_contig_impl_fn_ptr_t
+    logical_not_contig_dispatch_vector[td_ns::num_types];
+static int logical_not_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    logical_not_strided_dispatch_vector[td_ns::num_types];
+
+void populate_logical_not_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logical_not_fn_ns;
+
+    using fn_ns::LogicalNotContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, LogicalNotContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(logical_not_contig_dispatch_vector);
+
+    using fn_ns::LogicalNotStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t,
+                          LogicalNotStridedFactory, num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(logical_not_strided_dispatch_vector);
+
+    using fn_ns::LogicalNotTypeMapFactory;
+    DispatchVectorBuilder<int, LogicalNotTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(logical_not_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_logical_not(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logical_not_dispatch_vectors();
+        using impl::logical_not_contig_dispatch_vector;
+        using impl::logical_not_output_typeid_vector;
+        using impl::logical_not_strided_dispatch_vector;
+
+        auto logical_not_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                     sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  logical_not_output_typeid_vector,
+                                  logical_not_contig_dispatch_vector,
+                                  logical_not_strided_dispatch_vector);
+        };
+        m.def("_logical_not", logical_not_pyapi, "", py::arg("src"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+
+        auto logical_not_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(
+                dtype, logical_not_output_typeid_vector);
+        };
+        m.def("_logical_not_result_type", logical_not_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.hpp
new file mode 100644
index 0000000000..c1a2c393aa
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_logical_not(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
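Unary bindings such as _logical_not dispatch on a single type id, hence the one-dimensional vectors above instead of two-dimensional tables, and the Python-side signature shrinks to (src, dst, sycl_queue, depends). A sketch under the same assumptions as the earlier examples:

    import dpctl
    import dpctl.tensor as dpt
    import dpctl.tensor._tensor_impl as ti

    q = dpctl.SyclQueue()
    x = dpt.asarray([0.0, 1.5, 0.0], sycl_queue=q)
    out = dpt.empty(x.shape, dtype=ti._logical_not_result_type(x.dtype),
                    sycl_queue=q)

    ht_ev, _ = ti._logical_not(src=x, dst=out, sycl_queue=q)
    ht_ev.wait()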
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.cpp
new file mode 100644
index 0000000000..ebf8251b2e
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "logical_or.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logical_or.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B17: ===== LOGICAL_OR (x1, x2)
+namespace impl
+{
+namespace logical_or_fn_ns = dpctl::tensor::kernels::logical_or;
+
+static binary_contig_impl_fn_ptr_t
+    logical_or_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int logical_or_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    logical_or_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_logical_or_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logical_or_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LogicalOrTypeMapFactory;
+    DispatchTableBuilder<int, LogicalOrTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(logical_or_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LogicalOrStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, LogicalOrStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(logical_or_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LogicalOrContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogicalOrContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(logical_or_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_logical_or(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logical_or_dispatch_tables();
+        using impl::logical_or_contig_dispatch_table;
+        using impl::logical_or_output_id_table;
+        using impl::logical_or_strided_dispatch_table;
+
+        auto logical_or_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                    const arrayT &dst, sycl::queue &exec_q,
+                                    const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, logical_or_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                logical_or_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                logical_or_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto logical_or_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               logical_or_output_id_table);
+        };
+        m.def("_logical_or", logical_or_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
py::arg("depends") = py::list()); + m.def("_logical_or_result_type", logical_or_result_type_pyapi, ""); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.hpp new file mode 100644 index 0000000000..00a4ddfcc2 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_logical_xor(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.cpp new file mode 100644 index 0000000000..9488a5615a --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.cpp @@ -0,0 +1,140 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.cpp
new file mode 100644
index 0000000000..9488a5615a
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "logical_xor.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/logical_xor.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B18: ===== LOGICAL_XOR (x1, x2)
+namespace impl
+{
+namespace logical_xor_fn_ns = dpctl::tensor::kernels::logical_xor;
+
+static binary_contig_impl_fn_ptr_t
+    logical_xor_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int logical_xor_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    logical_xor_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_logical_xor_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = logical_xor_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::LogicalXorTypeMapFactory;
+    DispatchTableBuilder<int, LogicalXorTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(logical_xor_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::LogicalXorStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
+                         LogicalXorStridedFactory, num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(logical_xor_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::LogicalXorContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, LogicalXorContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(logical_xor_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_logical_xor(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_logical_xor_dispatch_tables();
+        using impl::logical_xor_contig_dispatch_table;
+        using impl::logical_xor_output_id_table;
+        using impl::logical_xor_strided_dispatch_table;
+
+        auto logical_xor_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                     const arrayT &dst, sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, logical_xor_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                logical_xor_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                logical_xor_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto logical_xor_result_type_pyapi = [&](const py::dtype &dtype1,
+                                                 const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               logical_xor_output_id_table);
+        };
+        m.def("_logical_xor", logical_xor_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"),
py::arg("sycl_queue"), + py::arg("depends") = py::list()); + m.def("_logical_xor_result_type", logical_xor_result_type_pyapi, ""); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.hpp new file mode 100644 index 0000000000..ad069eb120 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_logical_or(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/maximum.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/maximum.cpp new file mode 100644 index 0000000000..208bdcf47f --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/maximum.cpp @@ -0,0 +1,140 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/maximum.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/maximum.cpp
new file mode 100644
index 0000000000..208bdcf47f
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/maximum.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "maximum.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/maximum.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B26: ===== MAXIMUM (x1, x2)
+namespace impl
+{
+namespace maximum_fn_ns = dpctl::tensor::kernels::maximum;
+
+static binary_contig_impl_fn_ptr_t
+    maximum_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int maximum_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    maximum_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_maximum_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = maximum_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::MaximumTypeMapFactory;
+    DispatchTableBuilder<int, MaximumTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(maximum_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::MaximumStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, MaximumStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(maximum_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::MaximumContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, MaximumContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(maximum_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_maximum(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_maximum_dispatch_tables();
+        using impl::maximum_contig_dispatch_table;
+        using impl::maximum_output_id_table;
+        using impl::maximum_strided_dispatch_table;
+
+        auto maximum_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                 const arrayT &dst, sycl::queue &exec_q,
+                                 const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, maximum_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                maximum_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                maximum_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto maximum_result_type_pyapi = [&](const py::dtype &dtype1,
+                                             const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               maximum_output_id_table);
+        };
+        m.def("_maximum", maximum_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_maximum_result_type", maximum_result_type_pyapi, "");
""); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/maximum.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/maximum.hpp new file mode 100644 index 0000000000..0f49850567 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/maximum.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_maximum(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/minimum.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/minimum.cpp new file mode 100644 index 0000000000..dc1a826ac4 --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/minimum.cpp @@ -0,0 +1,140 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/minimum.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/minimum.cpp
new file mode 100644
index 0000000000..dc1a826ac4
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/minimum.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "minimum.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/minimum.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B27: ===== MINIMUM (x1, x2)
+namespace impl
+{
+namespace minimum_fn_ns = dpctl::tensor::kernels::minimum;
+
+static binary_contig_impl_fn_ptr_t
+    minimum_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int minimum_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    minimum_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_minimum_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = minimum_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::MinimumTypeMapFactory;
+    DispatchTableBuilder<int, MinimumTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(minimum_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::MinimumStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, MinimumStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(minimum_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::MinimumContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, MinimumContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(minimum_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_minimum(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_minimum_dispatch_tables();
+        using impl::minimum_contig_dispatch_table;
+        using impl::minimum_output_id_table;
+        using impl::minimum_strided_dispatch_table;
+
+        auto minimum_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                 const arrayT &dst, sycl::queue &exec_q,
+                                 const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, minimum_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                minimum_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                minimum_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto minimum_result_type_pyapi = [&](const py::dtype &dtype1,
+                                             const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               minimum_output_id_table);
+        };
+        m.def("_minimum", minimum_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_minimum_result_type", minimum_result_type_pyapi, "");
""); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/minimum.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/minimum.hpp new file mode 100644 index 0000000000..f1f2467c1e --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/minimum.hpp @@ -0,0 +1,42 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_minimum(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/multiply.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/multiply.cpp new file mode 100644 index 0000000000..c087abd9ff --- /dev/null +++ b/dpctl/tensor/libtensor/source/elementwise_functions/multiply.cpp @@ -0,0 +1,230 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for elementwise operations. 
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/multiply.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/multiply.cpp
new file mode 100644
index 0000000000..c087abd9ff
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/multiply.cpp
@@ -0,0 +1,230 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "multiply.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/multiply.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B19: ===== MULTIPLY (x1, x2)
+namespace impl
+{
+
+namespace multiply_fn_ns = dpctl::tensor::kernels::multiply;
+
+static binary_contig_impl_fn_ptr_t
+    multiply_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int multiply_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    multiply_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+// mul(matrix, row)
+static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
+    multiply_contig_matrix_contig_row_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+// mul(row, matrix)
+static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
+    multiply_contig_row_contig_matrix_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    multiply_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    multiply_inplace_strided_dispatch_table[td_ns::num_types]
+                                           [td_ns::num_types];
+static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
+    multiply_inplace_row_matrix_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+void populate_multiply_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = multiply_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::MultiplyTypeMapFactory;
+    DispatchTableBuilder<int, MultiplyTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(multiply_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::MultiplyStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, MultiplyStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(multiply_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::MultiplyContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, MultiplyContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(multiply_contig_dispatch_table);
+
+    // function pointers for operation on contiguous matrix, contiguous row
+    // with contiguous matrix output
+    using fn_ns::MultiplyContigMatrixContigRowBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
+        MultiplyContigMatrixContigRowBroadcastFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        multiply_contig_matrix_contig_row_broadcast_dispatch_table);
+
+    // function pointers for operation on contiguous row, contiguous matrix
+    // with contiguous matrix output
+    using fn_ns::MultiplyContigRowContigMatrixBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
+        MultiplyContigRowContigMatrixBroadcastFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(
+        multiply_contig_row_contig_matrix_broadcast_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::MultiplyInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         MultiplyInplaceStridedFactory, num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(multiply_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::MultiplyInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         MultiplyInplaceContigFactory, num_types>
+        dtb7;
+    dtb7.populate_dispatch_table(multiply_inplace_contig_dispatch_table);
+
+    // function pointers for inplace operation on contiguous matrix
+    // and contiguous row
+    using fn_ns::MultiplyInplaceRowMatrixBroadcastFactory;
+    DispatchTableBuilder<binary_inplace_row_matrix_broadcast_impl_fn_ptr_t,
+                         MultiplyInplaceRowMatrixBroadcastFactory, num_types>
+        dtb8;
+    dtb8.populate_dispatch_table(multiply_inplace_row_matrix_dispatch_table);
+};
+
+} // namespace impl
+
+void init_multiply(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_multiply_dispatch_tables();
+        using impl::multiply_contig_dispatch_table;
+        using impl::multiply_contig_matrix_contig_row_broadcast_dispatch_table;
+        using impl::multiply_contig_row_contig_matrix_broadcast_dispatch_table;
+        using impl::multiply_output_id_table;
+        using impl::multiply_strided_dispatch_table;
+
+        auto multiply_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                  const arrayT &dst, sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, multiply_output_id_table,
+                // function pointers to handle operation on contiguous
+                // arrays (pointers may be nullptr)
+                multiply_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                multiply_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix
+                // and c-contig row with broadcasting (may be nullptr)
+                multiply_contig_matrix_contig_row_broadcast_dispatch_table,
+                // function pointers to handle operation of c-contig row
+                // and c-contig matrix with broadcasting (may be nullptr)
+                multiply_contig_row_contig_matrix_broadcast_dispatch_table);
+        };
+        auto multiply_result_type_pyapi = [&](const py::dtype &dtype1,
+                                              const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               multiply_output_id_table);
+        };
+        m.def("_multiply", multiply_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_multiply_result_type", multiply_result_type_pyapi, "");
+
+        using impl::multiply_inplace_contig_dispatch_table;
+        using impl::multiply_inplace_row_matrix_dispatch_table;
+        using impl::multiply_inplace_strided_dispatch_table;
+
+        auto multiply_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                          sycl::queue &exec_q,
+                                          const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, multiply_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                multiply_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                multiply_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix and c-contig row with broadcasting
+                // (may be nullptr)
+                multiply_inplace_row_matrix_dispatch_table);
+        };
+        m.def("_multiply_inplace", multiply_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/multiply.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/multiply.hpp
new file mode 100644
index 0000000000..e110ecbb20
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/multiply.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_multiply(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
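multiply is the first function in this series to also register broadcast-specialized and in-place implementations. Note the argument mapping in the m.def above: py::arg("lhs") binds to the lambda's src parameter and py::arg("rhs") to dst, so it is the array passed as rhs that is updated in place (an assumption about py_binary_inplace_ufunc's convention, consistent with the row/matrix naming of its broadcast table). A sketch; multiplication is commutative, so operand order does not change the values:

    import dpctl
    import dpctl.tensor as dpt
    import dpctl.tensor._tensor_impl as ti

    q = dpctl.SyclQueue()
    x = dpt.asarray([1.0, 2.0, 3.0], sycl_queue=q)
    y = dpt.asarray([10.0, 10.0, 10.0], sycl_queue=q)

    ht_ev, _ = ti._multiply_inplace(lhs=y, rhs=x, sycl_queue=q)
    ht_ev.wait()
    print(dpt.asnumpy(x))  # [10. 20. 30.]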
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/negative.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/negative.cpp
new file mode 100644
index 0000000000..bc659506d1
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/negative.cpp
@@ -0,0 +1,122 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "negative.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/negative.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U25: ==== NEGATIVE (x)
+namespace impl
+{
+
+namespace negative_fn_ns = dpctl::tensor::kernels::negative;
+
+static unary_contig_impl_fn_ptr_t
+    negative_contig_dispatch_vector[td_ns::num_types];
+static int negative_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    negative_strided_dispatch_vector[td_ns::num_types];
+
+void populate_negative_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = negative_fn_ns;
+
+    using fn_ns::NegativeContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, NegativeContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(negative_contig_dispatch_vector);
+
+    using fn_ns::NegativeStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, NegativeStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(negative_strided_dispatch_vector);
+
+    using fn_ns::NegativeTypeMapFactory;
+    DispatchVectorBuilder<int, NegativeTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(negative_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_negative(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_negative_dispatch_vectors();
+        using impl::negative_contig_dispatch_vector;
+        using impl::negative_output_typeid_vector;
+        using impl::negative_strided_dispatch_vector;
+
+        auto negative_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                  sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  negative_output_typeid_vector,
+                                  negative_contig_dispatch_vector,
+                                  negative_strided_dispatch_vector);
+        };
+        m.def("_negative", negative_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto negative_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              negative_output_typeid_vector);
+        };
+        m.def("_negative_result_type", negative_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/negative.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/negative.hpp
new file mode 100644
index 0000000000..048e481b34
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/negative.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_negative(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
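A sketch exercising _negative end to end; the public dpt.negative performs the same steps with full validation:

    import dpctl
    import dpctl.tensor as dpt
    import dpctl.tensor._tensor_impl as ti

    q = dpctl.SyclQueue()
    x = dpt.asarray([1, -2, 3], dtype="i4", sycl_queue=q)
    out = dpt.empty(x.shape, dtype=ti._negative_result_type(x.dtype),
                    sycl_queue=q)

    ht_ev, _ = ti._negative(src=x, dst=out, sycl_queue=q)
    ht_ev.wait()
    print(dpt.asnumpy(out))  # [-1  2 -3]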
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.cpp
new file mode 100644
index 0000000000..a7a3e909cb
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.cpp
@@ -0,0 +1,140 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "not_equal.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/not_equal.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+// B20: ===== NOT_EQUAL (x1, x2)
+namespace impl
+{
+namespace not_equal_fn_ns = dpctl::tensor::kernels::not_equal;
+
+static binary_contig_impl_fn_ptr_t
+    not_equal_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int not_equal_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    not_equal_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_not_equal_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = not_equal_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::NotEqualTypeMapFactory;
+    DispatchTableBuilder<int, NotEqualTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(not_equal_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::NotEqualStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, NotEqualStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(not_equal_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::NotEqualContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, NotEqualContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(not_equal_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_not_equal(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_not_equal_dispatch_tables();
+        using impl::not_equal_contig_dispatch_table;
+        using impl::not_equal_output_id_table;
+        using impl::not_equal_strided_dispatch_table;
+
+        auto not_equal_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                   const arrayT &dst, sycl::queue &exec_q,
+                                   const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, not_equal_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                not_equal_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                not_equal_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto not_equal_result_type_pyapi = [&](const py::dtype &dtype1,
+                                               const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               not_equal_output_id_table);
+        };
+        m.def("_not_equal", not_equal_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+
+        m.def("_not_equal_result_type", not_equal_result_type_pyapi, "");
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.hpp
new file mode 100644
index 0000000000..4e1f654e79
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_not_equal(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/positive.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/positive.cpp
new file mode 100644
index 0000000000..eaff0794d2
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/positive.cpp
@@ -0,0 +1,122 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "positive.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/positive.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U26: ==== POSITIVE (x)
+namespace impl
+{
+
+namespace positive_fn_ns = dpctl::tensor::kernels::positive;
+
+static unary_contig_impl_fn_ptr_t
+    positive_contig_dispatch_vector[td_ns::num_types];
+static int positive_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    positive_strided_dispatch_vector[td_ns::num_types];
+
+void populate_positive_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = positive_fn_ns;
+
+    using fn_ns::PositiveContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, PositiveContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(positive_contig_dispatch_vector);
+
+    using fn_ns::PositiveStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, PositiveStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(positive_strided_dispatch_vector);
+
+    using fn_ns::PositiveTypeMapFactory;
+    DispatchVectorBuilder<int, PositiveTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(positive_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_positive(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_positive_dispatch_vectors();
+        using impl::positive_contig_dispatch_vector;
+        using impl::positive_output_typeid_vector;
+        using impl::positive_strided_dispatch_vector;
+
+        auto positive_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                  sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  positive_output_typeid_vector,
+                                  positive_contig_dispatch_vector,
+                                  positive_strided_dispatch_vector);
+        };
+        m.def("_positive", positive_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto positive_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              positive_output_typeid_vector);
+        };
+        m.def("_positive_result_type", positive_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/positive.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/positive.hpp
new file mode 100644
index 0000000000..a7b19a07ab
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/positive.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_positive(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/pow.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/pow.cpp
new file mode 100644
index 0000000000..a8ef6cb171
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/pow.cpp
@@ -0,0 +1,189 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "pow.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/pow.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B21: ===== POW (x1, x2)
+namespace impl
+{
+
+namespace pow_fn_ns = dpctl::tensor::kernels::pow;
+
+static binary_contig_impl_fn_ptr_t pow_contig_dispatch_table[td_ns::num_types]
+                                                            [td_ns::num_types];
+static int pow_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    pow_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    pow_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    pow_inplace_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_pow_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = pow_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::PowTypeMapFactory;
+    DispatchTableBuilder<int, PowTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(pow_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::PowStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, PowStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(pow_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::PowContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, PowContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(pow_contig_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::PowInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         PowInplaceStridedFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(pow_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::PowInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         PowInplaceContigFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(pow_inplace_contig_dispatch_table);
+};
+
+} // namespace impl
+
+void init_pow(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_pow_dispatch_tables();
+        using impl::pow_contig_dispatch_table;
+        using impl::pow_output_id_table;
+        using impl::pow_strided_dispatch_table;
+
+        auto pow_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                             const arrayT &dst, sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, pow_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                pow_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                pow_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto pow_result_type_pyapi = [&](const py::dtype &dtype1,
+                                         const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               pow_output_id_table);
+        };
+        m.def("_pow", pow_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_pow_result_type", pow_result_type_pyapi, "");
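+
+        // The in-place variant registered below backs expressions such as
+        // `x **= y` at the Python level, which update the left-hand array
+        // without allocating a result. A minimal sketch of a direct call
+        // (the `ti` alias and the values are assumptions for exposition):
+        //
+        //   import dpctl.tensor as dpt
+        //   import dpctl.tensor._tensor_impl as ti
+        //   x = dpt.full(4, 2, dtype="f4")
+        //   y = dpt.full(4, 3, dtype="f4")
+        //   ht_ev, _ = ti._pow_inplace(lhs=x, rhs=y, sycl_queue=x.sycl_queue)
+        //   ht_ev.wait()   # x == [8.0, 8.0, 8.0, 8.0]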
+
+        using impl::pow_inplace_contig_dispatch_table;
+        using impl::pow_inplace_strided_dispatch_table;
+
+        auto pow_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                     sycl::queue &exec_q,
+                                     const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, pow_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                pow_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                pow_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        m.def("_pow_inplace", pow_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/pow.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/pow.hpp
new file mode 100644
index 0000000000..7a13b414eb
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/pow.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_pow(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/proj.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/proj.cpp
new file mode 100644
index 0000000000..60060084e1
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/proj.cpp
@@ -0,0 +1,119 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "proj.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/proj.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U40: ==== PROJ (x)
+namespace impl
+{
+
+namespace proj_fn_ns = dpctl::tensor::kernels::proj;
+
+static unary_contig_impl_fn_ptr_t
+    proj_contig_dispatch_vector[td_ns::num_types];
+static int proj_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    proj_strided_dispatch_vector[td_ns::num_types];
+
+void populate_proj_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = proj_fn_ns;
+
+    using fn_ns::ProjContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, ProjContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(proj_contig_dispatch_vector);
+
+    using fn_ns::ProjStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, ProjStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(proj_strided_dispatch_vector);
+
+    using fn_ns::ProjTypeMapFactory;
+    DispatchVectorBuilder<int, ProjTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(proj_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_proj(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_proj_dispatch_vectors();
+        using impl::proj_contig_dispatch_vector;
+        using impl::proj_output_typeid_vector;
+        using impl::proj_strided_dispatch_vector;
+
+        auto proj_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, proj_output_typeid_vector,
+                proj_contig_dispatch_vector, proj_strided_dispatch_vector);
+        };
+        m.def("_proj", proj_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto proj_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, proj_output_typeid_vector);
+        };
+        m.def("_proj_result_type", proj_result_type_pyapi);
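+
+        // Illustrative use via the public wrapper (a sketch; the input value
+        // is an assumption for exposition):
+        //
+        //   import dpctl.tensor as dpt
+        //   x = dpt.asarray([complex(1.0, float("inf"))])
+        //   r = dpt.proj(x)   # projects to (inf + 0j), per C99 cproj rules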
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/proj.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/proj.hpp
new file mode 100644
index 0000000000..efbe751455
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/proj.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_proj(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/real.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/real.cpp
new file mode 100644
index 0000000000..890a308a4e
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/real.cpp
@@ -0,0 +1,119 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "real.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/real.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U27: ==== REAL (x)
+namespace impl
+{
+
+namespace real_fn_ns = dpctl::tensor::kernels::real;
+
+static unary_contig_impl_fn_ptr_t
+    real_contig_dispatch_vector[td_ns::num_types];
+static int real_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    real_strided_dispatch_vector[td_ns::num_types];
+
+void populate_real_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = real_fn_ns;
+
+    using fn_ns::RealContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, RealContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(real_contig_dispatch_vector);
+
+    using fn_ns::RealStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, RealStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(real_strided_dispatch_vector);
+
+    using fn_ns::RealTypeMapFactory;
+    DispatchVectorBuilder<int, RealTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(real_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_real(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_real_dispatch_vectors();
+        using impl::real_contig_dispatch_vector;
+        using impl::real_output_typeid_vector;
+        using impl::real_strided_dispatch_vector;
+
+        auto real_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, real_output_typeid_vector,
+                real_contig_dispatch_vector, real_strided_dispatch_vector);
+        };
+        m.def("_real", real_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto real_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, real_output_typeid_vector);
+        };
+        m.def("_real_result_type", real_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/real.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/real.hpp
new file mode 100644
index 0000000000..b380632448
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/real.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_real(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/remainder.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/remainder.cpp
new file mode 100644
index 0000000000..3255ea7e7f
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/remainder.cpp
@@ -0,0 +1,190 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "remainder.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/remainder.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B22: ===== REMAINDER (x1, x2)
+namespace impl
+{
+
+namespace remainder_fn_ns = dpctl::tensor::kernels::remainder;
+
+static binary_contig_impl_fn_ptr_t
+    remainder_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int remainder_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    remainder_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    remainder_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    remainder_inplace_strided_dispatch_table[td_ns::num_types]
+                                            [td_ns::num_types];
+
+void populate_remainder_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = remainder_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::RemainderTypeMapFactory;
+    DispatchTableBuilder<int, RemainderTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(remainder_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::RemainderStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, RemainderStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(remainder_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::RemainderContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, RemainderContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(remainder_contig_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::RemainderInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         RemainderInplaceStridedFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(remainder_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::RemainderInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         RemainderInplaceContigFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(remainder_inplace_contig_dispatch_table);
+}
+
+} // namespace impl
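+
+// A sketch of how the tables above are consumed (illustrative, not code from
+// this file): py_binary_ufunc maps each input's typenum to a lookup id and
+// indexes the tables with the resulting pair, roughly
+//
+//   auto array_types = td_ns::usm_ndarray_types();
+//   int src1_tid = array_types.typenum_to_lookup_id(src1.get_typenum());
+//   int src2_tid = array_types.typenum_to_lookup_id(src2.get_typenum());
+//   int dst_tid = remainder_output_id_table[src1_tid][src2_tid];
+//   auto contig_fn = remainder_contig_dispatch_table[src1_tid][src2_tid];
+//   // a nullptr contig_fn means the strided implementation serves as the
+//   // general fallback for that type combination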
+
+void init_remainder(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_remainder_dispatch_tables();
+        using impl::remainder_contig_dispatch_table;
+        using impl::remainder_output_id_table;
+        using impl::remainder_strided_dispatch_table;
+
+        auto remainder_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                   const arrayT &dst, sycl::queue &exec_q,
+                                   const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, remainder_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                remainder_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                remainder_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto remainder_result_type_pyapi = [&](const py::dtype &dtype1,
+                                               const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               remainder_output_id_table);
+        };
+        m.def("_remainder", remainder_pyapi, "", py::arg("src1"),
+              py::arg("src2"), py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_remainder_result_type", remainder_result_type_pyapi, "");
+
+        using impl::remainder_inplace_contig_dispatch_table;
+        using impl::remainder_inplace_strided_dispatch_table;
+
+        auto remainder_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                           sycl::queue &exec_q,
+                                           const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, remainder_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                remainder_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                remainder_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_inplace_row_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        m.def("_remainder_inplace", remainder_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/remainder.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/remainder.hpp
new file mode 100644
index 0000000000..ef538547a8
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/remainder.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_remainder(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/round.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/round.cpp
new file mode 100644
index 0000000000..cce730b899
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/round.cpp
@@ -0,0 +1,121 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "round.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/round.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U28: ==== ROUND (x)
+namespace impl
+{
+
+namespace round_fn_ns = dpctl::tensor::kernels::round;
+
+static unary_contig_impl_fn_ptr_t
+    round_contig_dispatch_vector[td_ns::num_types];
+static int round_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    round_strided_dispatch_vector[td_ns::num_types];
+
+void populate_round_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = round_fn_ns;
+
+    using fn_ns::RoundContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, RoundContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(round_contig_dispatch_vector);
+
+    using fn_ns::RoundStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, RoundStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(round_strided_dispatch_vector);
+
+    using fn_ns::RoundTypeMapFactory;
+    DispatchVectorBuilder<int, RoundTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(round_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_round(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_round_dispatch_vectors();
+        using impl::round_contig_dispatch_vector;
+        using impl::round_output_typeid_vector;
+        using impl::round_strided_dispatch_vector;
+
+        auto round_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, round_output_typeid_vector,
+                round_contig_dispatch_vector, round_strided_dispatch_vector);
+        };
+        m.def("_round", round_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto round_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              round_output_typeid_vector);
+        };
+        m.def("_round_result_type", round_result_type_pyapi);
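+
+        // The *_result_type helpers report the dtype the dispatch tables
+        // would produce for a given input dtype, raising for unsupported
+        // types. A sketch (values assumed for exposition):
+        //
+        //   import dpctl.tensor._tensor_impl as ti
+        //   import numpy as np
+        //   ti._round_result_type(np.dtype("f4"))   # dtype('float32')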
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/round.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/round.hpp
new file mode 100644
index 0000000000..5753ef233b
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/round.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_round(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.cpp
new file mode 100644
index 0000000000..4661fdfa48
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.cpp
@@ -0,0 +1,121 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "rsqrt.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/rsqrt.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U39: ==== RSQRT (x)
+namespace impl
+{
+
+namespace rsqrt_fn_ns = dpctl::tensor::kernels::rsqrt;
+
+static unary_contig_impl_fn_ptr_t
+    rsqrt_contig_dispatch_vector[td_ns::num_types];
+static int rsqrt_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    rsqrt_strided_dispatch_vector[td_ns::num_types];
+
+void populate_rsqrt_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = rsqrt_fn_ns;
+
+    using fn_ns::RsqrtContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, RsqrtContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(rsqrt_contig_dispatch_vector);
+
+    using fn_ns::RsqrtStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, RsqrtStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(rsqrt_strided_dispatch_vector);
+
+    using fn_ns::RsqrtTypeMapFactory;
+    DispatchVectorBuilder<int, RsqrtTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(rsqrt_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_rsqrt(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_rsqrt_dispatch_vectors();
+        using impl::rsqrt_contig_dispatch_vector;
+        using impl::rsqrt_output_typeid_vector;
+        using impl::rsqrt_strided_dispatch_vector;
+
+        auto rsqrt_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, rsqrt_output_typeid_vector,
+                rsqrt_contig_dispatch_vector, rsqrt_strided_dispatch_vector);
+        };
+        m.def("_rsqrt", rsqrt_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto rsqrt_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              rsqrt_output_typeid_vector);
+        };
+        m.def("_rsqrt_result_type", rsqrt_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.hpp
new file mode 100644
index 0000000000..50efc16d79
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_rsqrt(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sign.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/sign.cpp
new file mode 100644
index 0000000000..7b7c2c22e5
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/sign.cpp
@@ -0,0 +1,119 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "sign.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/sign.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U29: ==== SIGN (x)
+namespace impl
+{
+
+namespace sign_fn_ns = dpctl::tensor::kernels::sign;
+
+static unary_contig_impl_fn_ptr_t
+    sign_contig_dispatch_vector[td_ns::num_types];
+static int sign_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    sign_strided_dispatch_vector[td_ns::num_types];
+
+void populate_sign_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = sign_fn_ns;
+
+    using fn_ns::SignContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, SignContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(sign_contig_dispatch_vector);
+
+    using fn_ns::SignStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, SignStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(sign_strided_dispatch_vector);
+
+    using fn_ns::SignTypeMapFactory;
+    DispatchVectorBuilder<int, SignTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(sign_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_sign(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_sign_dispatch_vectors();
+        using impl::sign_contig_dispatch_vector;
+        using impl::sign_output_typeid_vector;
+        using impl::sign_strided_dispatch_vector;
+
+        auto sign_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, sign_output_typeid_vector,
+                sign_contig_dispatch_vector, sign_strided_dispatch_vector);
+        };
+        m.def("_sign", sign_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto sign_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, sign_output_typeid_vector);
+        };
+        m.def("_sign_result_type", sign_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sign.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/sign.hpp
new file mode 100644
index 0000000000..fa01370842
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/sign.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_sign(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/signbit.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/signbit.cpp
new file mode 100644
index 0000000000..fc101dd64b
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/signbit.cpp
@@ -0,0 +1,122 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "signbit.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/signbit.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U41: ==== SIGNBIT (x)
+namespace impl
+{
+
+namespace signbit_fn_ns = dpctl::tensor::kernels::signbit;
+
+static unary_contig_impl_fn_ptr_t
+    signbit_contig_dispatch_vector[td_ns::num_types];
+static int signbit_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    signbit_strided_dispatch_vector[td_ns::num_types];
+
+void populate_signbit_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = signbit_fn_ns;
+
+    using fn_ns::SignbitContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, SignbitContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(signbit_contig_dispatch_vector);
+
+    using fn_ns::SignbitStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, SignbitStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(signbit_strided_dispatch_vector);
+
+    using fn_ns::SignbitTypeMapFactory;
+    DispatchVectorBuilder<int, SignbitTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(signbit_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_signbit(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_signbit_dispatch_vectors();
+        using impl::signbit_contig_dispatch_vector;
+        using impl::signbit_output_typeid_vector;
+        using impl::signbit_strided_dispatch_vector;
+
+        auto signbit_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                 sycl::queue &exec_q,
+                                 const event_vecT &depends = {}) {
+            return py_unary_ufunc(src, dst, exec_q, depends,
+                                  signbit_output_typeid_vector,
+                                  signbit_contig_dispatch_vector,
+                                  signbit_strided_dispatch_vector);
+        };
+        m.def("_signbit", signbit_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto signbit_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              signbit_output_typeid_vector);
+        };
+        m.def("_signbit_result_type", signbit_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/signbit.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/signbit.hpp
new file mode 100644
index 0000000000..85054bb4de
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/signbit.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_signbit(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sin.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/sin.cpp
new file mode 100644
index 0000000000..415dc15133
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/sin.cpp
@@ -0,0 +1,119 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "sin.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/sin.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U30: ==== SIN (x)
+namespace impl
+{
+
+namespace sin_fn_ns = dpctl::tensor::kernels::sin;
+
+static unary_contig_impl_fn_ptr_t sin_contig_dispatch_vector[td_ns::num_types];
+static int sin_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    sin_strided_dispatch_vector[td_ns::num_types];
+
+void populate_sin_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = sin_fn_ns;
+
+    using fn_ns::SinContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, SinContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(sin_contig_dispatch_vector);
+
+    using fn_ns::SinStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, SinStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(sin_strided_dispatch_vector);
+
+    using fn_ns::SinTypeMapFactory;
+    DispatchVectorBuilder<int, SinTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(sin_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_sin(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_sin_dispatch_vectors();
+        using impl::sin_contig_dispatch_vector;
+        using impl::sin_output_typeid_vector;
+        using impl::sin_strided_dispatch_vector;
+
+        auto sin_pyapi = [&](const arrayT &src, const arrayT &dst,
+                             sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, sin_output_typeid_vector,
+                sin_contig_dispatch_vector, sin_strided_dispatch_vector);
+        };
+        m.def("_sin", sin_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto sin_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, sin_output_typeid_vector);
+        };
+        m.def("_sin_result_type", sin_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
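
The companion `_sin_result_type` query exposes the type map built by `SinTypeMapFactory`. A small sketch of its intended behavior; the return values are inferred from `py_unary_ufunc_result_type`, which yields the mapped dtype, or `None` when an input type is unsupported:

    # Hedged sketch: querying the result-type map bound above.
    import dpctl.tensor as dpt
    from dpctl.tensor import _tensor_impl as ti

    print(ti._sin_result_type(dpt.dtype("f4")))  # float32 stays float32
    # integer inputs map to a floating-point result type
    print(ti._sin_result_type(dpt.dtype("i4")))
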
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sin.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/sin.hpp
new file mode 100644
index 0000000000..bd03604b16
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/sin.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_sin(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sinh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/sinh.cpp
new file mode 100644
index 0000000000..d9f92eb8f1
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/sinh.cpp
@@ -0,0 +1,119 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "sinh.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/sinh.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U31: ==== SINH (x)
+namespace impl
+{
+
+namespace sinh_fn_ns = dpctl::tensor::kernels::sinh;
+
+static unary_contig_impl_fn_ptr_t
+    sinh_contig_dispatch_vector[td_ns::num_types];
+static int sinh_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    sinh_strided_dispatch_vector[td_ns::num_types];
+
+void populate_sinh_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = sinh_fn_ns;
+
+    using fn_ns::SinhContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, SinhContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(sinh_contig_dispatch_vector);
+
+    using fn_ns::SinhStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, SinhStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(sinh_strided_dispatch_vector);
+
+    using fn_ns::SinhTypeMapFactory;
+    DispatchVectorBuilder<int, SinhTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(sinh_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_sinh(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_sinh_dispatch_vectors();
+        using impl::sinh_contig_dispatch_vector;
+        using impl::sinh_output_typeid_vector;
+        using impl::sinh_strided_dispatch_vector;
+
+        auto sinh_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, sinh_output_typeid_vector,
+                sinh_contig_dispatch_vector, sinh_strided_dispatch_vector);
+        };
+        m.def("_sinh", sinh_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto sinh_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              sinh_output_typeid_vector);
+        };
+        m.def("_sinh_result_type", sinh_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sinh.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/sinh.hpp
new file mode 100644
index 0000000000..fef8ec416a
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/sinh.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_sinh(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.cpp
new file mode 100644
index 0000000000..159d45b51c
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.cpp
@@ -0,0 +1,119 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "sqrt.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/sqrt.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U33: ==== SQRT (x)
+namespace impl
+{
+
+namespace sqrt_fn_ns = dpctl::tensor::kernels::sqrt;
+
+static unary_contig_impl_fn_ptr_t
+    sqrt_contig_dispatch_vector[td_ns::num_types];
+static int sqrt_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    sqrt_strided_dispatch_vector[td_ns::num_types];
+
+void populate_sqrt_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = sqrt_fn_ns;
+
+    using fn_ns::SqrtContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, SqrtContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(sqrt_contig_dispatch_vector);
+
+    using fn_ns::SqrtStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, SqrtStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(sqrt_strided_dispatch_vector);
+
+    using fn_ns::SqrtTypeMapFactory;
+    DispatchVectorBuilder<int, SqrtTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(sqrt_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_sqrt(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_sqrt_dispatch_vectors();
+        using impl::sqrt_contig_dispatch_vector;
+        using impl::sqrt_output_typeid_vector;
+        using impl::sqrt_strided_dispatch_vector;
+
+        auto sqrt_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, sqrt_output_typeid_vector,
+                sqrt_contig_dispatch_vector, sqrt_strided_dispatch_vector);
+        };
+        m.def("_sqrt", sqrt_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto sqrt_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              sqrt_output_typeid_vector);
+        };
+        m.def("_sqrt_result_type", sqrt_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.hpp
new file mode 100644
index 0000000000..38ea68635b
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_sqrt(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/square.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/square.cpp
new file mode 100644
index 0000000000..184e09c19c
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/square.cpp
@@ -0,0 +1,121 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "square.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/square.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U32: ==== SQUARE (x)
+namespace impl
+{
+
+namespace square_fn_ns = dpctl::tensor::kernels::square;
+
+static unary_contig_impl_fn_ptr_t
+    square_contig_dispatch_vector[td_ns::num_types];
+static int square_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    square_strided_dispatch_vector[td_ns::num_types];
+
+void populate_square_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = square_fn_ns;
+
+    using fn_ns::SquareContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, SquareContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(square_contig_dispatch_vector);
+
+    using fn_ns::SquareStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, SquareStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(square_strided_dispatch_vector);
+
+    using fn_ns::SquareTypeMapFactory;
+    DispatchVectorBuilder<int, SquareTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(square_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_square(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_square_dispatch_vectors();
+        using impl::square_contig_dispatch_vector;
+        using impl::square_output_typeid_vector;
+        using impl::square_strided_dispatch_vector;
+
+        auto square_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                sycl::queue &exec_q,
+                                const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, square_output_typeid_vector,
+                square_contig_dispatch_vector, square_strided_dispatch_vector);
+        };
+        m.def("_square", square_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto square_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              square_output_typeid_vector);
+        };
+        m.def("_square_result_type", square_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/square.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/square.hpp
new file mode 100644
index 0000000000..d8268b728a
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/square.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_square(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/subtract.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/subtract.cpp
new file mode 100644
index 0000000000..9703182e7a
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/subtract.cpp
@@ -0,0 +1,229 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "subtract.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/subtract.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B23: ===== SUBTRACT (x1, x2)
+namespace impl
+{
+namespace subtract_fn_ns = dpctl::tensor::kernels::subtract;
+
+static binary_contig_impl_fn_ptr_t
+    subtract_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int subtract_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    subtract_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+// sub(matrix, row)
+static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
+    subtract_contig_matrix_contig_row_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+// sub(row, matrix)
+static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
+    subtract_contig_row_contig_matrix_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    subtract_inplace_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    subtract_inplace_strided_dispatch_table[td_ns::num_types]
+                                           [td_ns::num_types];
+static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
+    subtract_inplace_row_matrix_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+void populate_subtract_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = subtract_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::SubtractTypeMapFactory;
+    DispatchTableBuilder<int, SubtractTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(subtract_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::SubtractStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, SubtractStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(subtract_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::SubtractContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, SubtractContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(subtract_contig_dispatch_table);
+
+    // function pointers for operation on contiguous matrix, contiguous row
+    // with contiguous matrix output
+    using fn_ns::SubtractContigMatrixContigRowBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
+        SubtractContigMatrixContigRowBroadcastFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        subtract_contig_matrix_contig_row_broadcast_dispatch_table);
+
+    // function pointers for operation on contiguous row, contiguous matrix
+    // with contiguous matrix output
+    using fn_ns::SubtractContigRowContigMatrixBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
+        SubtractContigRowContigMatrixBroadcastFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(
+        subtract_contig_row_contig_matrix_broadcast_dispatch_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::SubtractInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         SubtractInplaceStridedFactory, num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(subtract_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::SubtractInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         SubtractInplaceContigFactory, num_types>
+        dtb7;
+    dtb7.populate_dispatch_table(subtract_inplace_contig_dispatch_table);
+
+    // function pointers for inplace operation on contiguous matrix
+    // and contiguous row
+    using fn_ns::SubtractInplaceRowMatrixBroadcastFactory;
+    DispatchTableBuilder<binary_inplace_row_matrix_broadcast_impl_fn_ptr_t,
+                         SubtractInplaceRowMatrixBroadcastFactory, num_types>
+        dtb8;
+    dtb8.populate_dispatch_table(subtract_inplace_row_matrix_dispatch_table);
+};
+
+} // namespace impl
+
+void init_subtract(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_subtract_dispatch_tables();
+        using impl::subtract_contig_dispatch_table;
+        using impl::subtract_contig_matrix_contig_row_broadcast_dispatch_table;
+        using impl::subtract_contig_row_contig_matrix_broadcast_dispatch_table;
+        using impl::subtract_output_id_table;
+        using impl::subtract_strided_dispatch_table;
+
+        auto subtract_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                  const arrayT &dst, sycl::queue &exec_q,
+                                  const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, subtract_output_id_table,
+                // function pointers to handle operation on contiguous
+                // arrays (pointers may be nullptr)
+                subtract_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                subtract_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix
+                // and c-contig row with broadcasting (may be nullptr)
+                subtract_contig_matrix_contig_row_broadcast_dispatch_table,
+                // function pointers to handle operation of c-contig matrix
+                // and c-contig row with broadcasting (may be nullptr)
+                subtract_contig_row_contig_matrix_broadcast_dispatch_table);
+        };
+        auto subtract_result_type_pyapi = [&](const py::dtype &dtype1,
+                                              const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               subtract_output_id_table);
+        };
+        m.def("_subtract", subtract_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_subtract_result_type", subtract_result_type_pyapi, "");
+
+        using impl::subtract_inplace_contig_dispatch_table;
+        using impl::subtract_inplace_row_matrix_dispatch_table;
+        using impl::subtract_inplace_strided_dispatch_table;
+
+        auto subtract_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                          sycl::queue &exec_q,
+                                          const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends, subtract_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                subtract_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                subtract_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                subtract_inplace_row_matrix_dispatch_table);
+        };
+        m.def("_subtract_inplace", subtract_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/subtract.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/subtract.hpp
new file mode 100644
index 0000000000..0a4d707865
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/subtract.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_subtract(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
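
A sketch of how the `_subtract` and `_subtract_inplace` bindings compose from Python; keyword names come from the `m.def` calls above, while the event-pair handling mirrors the unary case and is otherwise an assumption:

    # Hedged usage sketch for the binary subtract bindings defined above.
    import dpctl.tensor as dpt
    from dpctl.tensor import _tensor_impl as ti

    a = dpt.ones(4, dtype="i4")
    b = dpt.full(4, 3, dtype="i4")
    out = dpt.empty_like(a)
    # out = a - b, with dst preallocated by the caller
    ht_ev, _ = ti._subtract(src1=a, src2=b, dst=out, sycl_queue=a.sycl_queue)
    ht_ev.wait()
    # out -= a, the in-place variant registered as _subtract_inplace
    ht_ev, _ = ti._subtract_inplace(lhs=out, rhs=a, sycl_queue=a.sycl_queue)
    ht_ev.wait()
    print(dpt.asnumpy(out))  # expected: [-3 -3 -3 -3]

The matrix-row broadcast tables registered above are an optimization path: `py_binary_ufunc` may select them instead of the general strided kernels when one operand is a C-contiguous matrix and the other a C-contiguous row.
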
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/tan.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/tan.cpp
new file mode 100644
index 0000000000..2f1fbf55f2
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/tan.cpp
@@ -0,0 +1,119 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "tan.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/tan.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U34: ==== TAN (x)
+namespace impl
+{
+
+namespace tan_fn_ns = dpctl::tensor::kernels::tan;
+
+static unary_contig_impl_fn_ptr_t tan_contig_dispatch_vector[td_ns::num_types];
+static int tan_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    tan_strided_dispatch_vector[td_ns::num_types];
+
+void populate_tan_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = tan_fn_ns;
+
+    using fn_ns::TanContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, TanContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(tan_contig_dispatch_vector);
+
+    using fn_ns::TanStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, TanStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(tan_strided_dispatch_vector);
+
+    using fn_ns::TanTypeMapFactory;
+    DispatchVectorBuilder<int, TanTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(tan_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_tan(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_tan_dispatch_vectors();
+        using impl::tan_contig_dispatch_vector;
+        using impl::tan_output_typeid_vector;
+        using impl::tan_strided_dispatch_vector;
+
+        auto tan_pyapi = [&](const arrayT &src, const arrayT &dst,
+                             sycl::queue &exec_q,
+                             const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, tan_output_typeid_vector,
+                tan_contig_dispatch_vector, tan_strided_dispatch_vector);
+        };
+        m.def("_tan", tan_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto tan_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, tan_output_typeid_vector);
+        };
+        m.def("_tan_result_type", tan_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/tan.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/tan.hpp
new file mode 100644
index 0000000000..f89c8b8f6d
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/tan.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_tan(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/tanh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/tanh.cpp
new file mode 100644
index 0000000000..033389e46d
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/tanh.cpp
@@ -0,0 +1,119 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "tanh.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/tanh.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U35: ==== TANH (x)
+namespace impl
+{
+
+namespace tanh_fn_ns = dpctl::tensor::kernels::tanh;
+
+static unary_contig_impl_fn_ptr_t
+    tanh_contig_dispatch_vector[td_ns::num_types];
+static int tanh_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    tanh_strided_dispatch_vector[td_ns::num_types];
+
+void populate_tanh_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = tanh_fn_ns;
+
+    using fn_ns::TanhContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, TanhContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(tanh_contig_dispatch_vector);
+
+    using fn_ns::TanhStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, TanhStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(tanh_strided_dispatch_vector);
+
+    using fn_ns::TanhTypeMapFactory;
+    DispatchVectorBuilder<int, TanhTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(tanh_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_tanh(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_tanh_dispatch_vectors();
+        using impl::tanh_contig_dispatch_vector;
+        using impl::tanh_output_typeid_vector;
+        using impl::tanh_strided_dispatch_vector;
+
+        auto tanh_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, tanh_output_typeid_vector,
+                tanh_contig_dispatch_vector, tanh_strided_dispatch_vector);
+        };
+        m.def("_tanh", tanh_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto tanh_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              tanh_output_typeid_vector);
+        };
+        m.def("_tanh_result_type", tanh_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/tanh.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/tanh.hpp
new file mode 100644
index 0000000000..e456182971
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/tanh.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_tanh(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp
new file mode 100644
index 0000000000..22ad9bf3cb
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp
@@ -0,0 +1,241 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "true_divide.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/common_inplace.hpp"
+#include "kernels/elementwise_functions/true_divide.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::binary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_strided_impl_fn_ptr_t;
+
+using ew_cmn_ns::binary_inplace_contig_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_row_matrix_broadcast_impl_fn_ptr_t;
+using ew_cmn_ns::binary_inplace_strided_impl_fn_ptr_t;
+
+// B08: ===== DIVIDE (x1, x2)
+namespace impl
+{
+namespace true_divide_fn_ns = dpctl::tensor::kernels::true_divide;
+
+static binary_contig_impl_fn_ptr_t
+    true_divide_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int true_divide_output_id_table[td_ns::num_types][td_ns::num_types];
+static int true_divide_inplace_output_id_table[td_ns::num_types]
+                                              [td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    true_divide_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+// divide(matrix, row)
+static binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t
+    true_divide_contig_matrix_contig_row_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+// divide(row, matrix)
+static binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t
+    true_divide_contig_row_contig_matrix_broadcast_dispatch_table
+        [td_ns::num_types][td_ns::num_types];
+
+static binary_inplace_contig_impl_fn_ptr_t
+    true_divide_inplace_contig_dispatch_table[td_ns::num_types]
+                                             [td_ns::num_types];
+static binary_inplace_strided_impl_fn_ptr_t
+    true_divide_inplace_strided_dispatch_table[td_ns::num_types]
+                                              [td_ns::num_types];
+static binary_inplace_row_matrix_broadcast_impl_fn_ptr_t
+    true_divide_inplace_row_matrix_dispatch_table[td_ns::num_types]
+                                                 [td_ns::num_types];
+
+void populate_true_divide_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = true_divide_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::TrueDivideTypeMapFactory;
+    DispatchTableBuilder<int, TrueDivideTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(true_divide_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::TrueDivideStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t,
+                         TrueDivideStridedFactory, num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(true_divide_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::TrueDivideContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, TrueDivideContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(true_divide_contig_dispatch_table);
+
+    // function pointers for operation on contiguous matrix, contiguous row
+    // with contiguous matrix output
+    using fn_ns::TrueDivideContigMatrixContigRowBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t,
+        TrueDivideContigMatrixContigRowBroadcastFactory, num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(
+        true_divide_contig_matrix_contig_row_broadcast_dispatch_table);
+
+    // function pointers for operation on contiguous row, contiguous matrix
+    // with contiguous matrix output
+    using fn_ns::TrueDivideContigRowContigMatrixBroadcastFactory;
+    DispatchTableBuilder<
+        binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t,
+        TrueDivideContigRowContigMatrixBroadcastFactory, num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(
+        true_divide_contig_row_contig_matrix_broadcast_dispatch_table);
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::TrueDivideInplaceTypeMapFactory;
+    DispatchTableBuilder<int, TrueDivideInplaceTypeMapFactory, num_types> dtb6;
+    dtb6.populate_dispatch_table(true_divide_inplace_output_id_table);
+
+    // function pointers for inplace operation on general strided arrays
+    using fn_ns::TrueDivideInplaceStridedFactory;
+    DispatchTableBuilder<binary_inplace_strided_impl_fn_ptr_t,
+                         TrueDivideInplaceStridedFactory, num_types>
+        dtb7;
+    dtb7.populate_dispatch_table(true_divide_inplace_strided_dispatch_table);
+
+    // function pointers for inplace operation on contiguous inputs and output
+    using fn_ns::TrueDivideInplaceContigFactory;
+    DispatchTableBuilder<binary_inplace_contig_impl_fn_ptr_t,
+                         TrueDivideInplaceContigFactory, num_types>
+        dtb8;
+    dtb8.populate_dispatch_table(true_divide_inplace_contig_dispatch_table);
+
+    // function pointers for inplace operation on contiguous matrix
+    // and contiguous row
+    using fn_ns::TrueDivideInplaceRowMatrixBroadcastFactory;
+    DispatchTableBuilder<binary_inplace_row_matrix_broadcast_impl_fn_ptr_t,
+                         TrueDivideInplaceRowMatrixBroadcastFactory, num_types>
+        dtb9;
+    dtb9.populate_dispatch_table(
+        true_divide_inplace_row_matrix_dispatch_table);
+};
+
+} // namespace impl
+
+void init_divide(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_true_divide_dispatch_tables();
+        using impl::true_divide_contig_dispatch_table;
+        using impl::
+            true_divide_contig_matrix_contig_row_broadcast_dispatch_table;
+        using impl::
+            true_divide_contig_row_contig_matrix_broadcast_dispatch_table;
+        using impl::true_divide_output_id_table;
+        using impl::true_divide_strided_dispatch_table;
+
+        auto divide_pyapi = [&](const arrayT &src1, const arrayT &src2,
+                                const arrayT &dst, sycl::queue &exec_q,
+                                const event_vecT &depends = {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, true_divide_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                true_divide_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays
+                // (most general case)
+                true_divide_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                true_divide_contig_matrix_contig_row_broadcast_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                true_divide_contig_row_contig_matrix_broadcast_dispatch_table);
+        };
+        auto divide_result_type_pyapi = [&](const py::dtype &dtype1,
+                                            const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               true_divide_output_id_table);
+        };
+        m.def("_divide", divide_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_divide_result_type", divide_result_type_pyapi, "");
+
+        using impl::true_divide_inplace_contig_dispatch_table;
+        using impl::true_divide_inplace_output_id_table;
+        using impl::true_divide_inplace_row_matrix_dispatch_table;
+        using impl::true_divide_inplace_strided_dispatch_table;
+
+        auto divide_inplace_pyapi = [&](const arrayT &src, const arrayT &dst,
+                                        sycl::queue &exec_q,
+                                        const event_vecT &depends = {}) {
+            return py_binary_inplace_ufunc(
+                src, dst, exec_q, depends,
+                true_divide_inplace_output_id_table,
+                // function pointers to handle inplace operation on
+                // contiguous arrays (pointers may be nullptr)
+                true_divide_inplace_contig_dispatch_table,
+                // function pointers to handle inplace operation on strided
+                // arrays (most general case)
+                true_divide_inplace_strided_dispatch_table,
+                // function pointers to handle inplace operation on
+                // c-contig matrix with c-contig row with broadcasting
+                // (may be nullptr)
+                true_divide_inplace_row_matrix_dispatch_table);
+        };
+        m.def("_divide_inplace", divide_inplace_pyapi, "", py::arg("lhs"),
+              py::arg("rhs"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.hpp
new file mode 100644
index 0000000000..e29b858dae
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_divide(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
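
True divide is the one binary function in this group with its own in-place type map (`true_divide_inplace_output_id_table` above): integer `lhs /= rhs` has no integer result type, so only left-hand sides whose dtype can hold the quotient dispatch to an in-place kernel. A sketch of the visible effect; the `TypeError` for the rejected case is an assumption modeled on the in-place tests added later in this patch:

    # Hedged sketch of in-place divide dispatch through _divide_inplace.
    import dpctl.tensor as dpt

    x = dpt.ones(8, dtype="f4")
    x /= dpt.full(8, 2, dtype="f4")  # floating lhs: in-place kernel exists
    print(dpt.asnumpy(x))  # expected: array of 0.5

    i = dpt.ones(8, dtype="i4")
    try:
        i /= dpt.full(8, 2, dtype="i4")  # no int /= int entry in the map
    except TypeError:
        print("in-place true divide rejected for integer lhs")
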
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/trunc.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/trunc.cpp
new file mode 100644
index 0000000000..5b2f451fb0
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/trunc.cpp
@@ -0,0 +1,121 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <vector>
+
+#include "elementwise_functions.hpp"
+#include "trunc.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include "kernels/elementwise_functions/trunc.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace ew_cmn_ns = dpctl::tensor::kernels::elementwise_common;
+using ew_cmn_ns::unary_contig_impl_fn_ptr_t;
+using ew_cmn_ns::unary_strided_impl_fn_ptr_t;
+
+// U36: ==== TRUNC (x)
+namespace impl
+{
+
+namespace trunc_fn_ns = dpctl::tensor::kernels::trunc;
+
+static unary_contig_impl_fn_ptr_t
+    trunc_contig_dispatch_vector[td_ns::num_types];
+static int trunc_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    trunc_strided_dispatch_vector[td_ns::num_types];
+
+void populate_trunc_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = trunc_fn_ns;
+
+    using fn_ns::TruncContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, TruncContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(trunc_contig_dispatch_vector);
+
+    using fn_ns::TruncStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, TruncStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(trunc_strided_dispatch_vector);
+
+    using fn_ns::TruncTypeMapFactory;
+    DispatchVectorBuilder<int, TruncTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(trunc_output_typeid_vector);
+};
+
+} // namespace impl
+
+void init_trunc(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        impl::populate_trunc_dispatch_vectors();
+        using impl::trunc_contig_dispatch_vector;
+        using impl::trunc_output_typeid_vector;
+        using impl::trunc_strided_dispatch_vector;
+
+        auto trunc_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, trunc_output_typeid_vector,
+                trunc_contig_dispatch_vector, trunc_strided_dispatch_vector);
+        };
+        m.def("_trunc", trunc_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto trunc_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              trunc_output_typeid_vector);
+        };
+        m.def("_trunc_result_type", trunc_result_type_pyapi);
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/trunc.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/trunc.hpp
new file mode 100644
index 0000000000..cc28397f55
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/elementwise_functions/trunc.hpp
@@ -0,0 +1,42 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for elementwise operations.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_trunc(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp
index 0e8b4236b6..282aecc95d 100644
--- a/dpctl/tensor/libtensor/source/tensor_py.cpp
+++ b/dpctl/tensor/libtensor/source/tensor_py.cpp
@@ -42,7 +42,7 @@
 #include "copy_for_roll.hpp"
 #include "copy_numpy_ndarray_into_usm_ndarray.hpp"
 #include "device_support_queries.hpp"
-#include "elementwise_functions.hpp"
+#include "elementwise_functions/elementwise_common.hpp"
 #include "eye_ctor.hpp"
 #include "full_ctor.hpp"
 #include "integer_advanced_indexing.hpp"
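
The four test modules below share one skeleton: gate the in-place branch on `_can_cast` with the device's fp16/fp64 aspects, expect the operator to succeed when the right-hand dtype safely casts to the left, and expect `TypeError` otherwise. A condensed sketch of that pattern (the helper names come from dpctl; the specific dtypes are illustrative):

    # Hedged sketch of the in-place dtype-matrix test pattern used below.
    import pytest
    import dpctl.tensor as dpt
    from dpctl.tensor._type_utils import _can_cast

    def check_inplace_and(q, dt1, dt2):
        ar1 = dpt.ones(16, dtype=dt1, sycl_queue=q)
        ar2 = dpt.ones_like(ar1, dtype=dt2, sycl_queue=q)
        dev = q.sycl_device
        if _can_cast(ar2.dtype, ar1.dtype,
                     dev.has_aspect_fp16, dev.has_aspect_fp64):
            ar1 &= ar2  # permitted when rhs dtype casts safely to lhs dtype
            assert dpt.all(ar1 == 1)
        else:
            with pytest.raises(TypeError):
                ar1 &= ar2
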
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.bitwise_and(ar3, ar4, out=ar4)
+        assert dpt.all(ar4 == 1)
+    else:
+        with pytest.raises(TypeError):
+            dpt.bitwise_and(ar1, ar2, out=ar2)
diff --git a/dpctl/tests/elementwise/test_bitwise_left_shift.py b/dpctl/tests/elementwise/test_bitwise_left_shift.py
index cee1019353..06684ac13b 100644
--- a/dpctl/tests/elementwise/test_bitwise_left_shift.py
+++ b/dpctl/tests/elementwise/test_bitwise_left_shift.py
@@ -18,6 +18,7 @@
 import pytest
 
 import dpctl.tensor as dpt
+from dpctl.tensor._type_utils import _can_cast
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
 
 from .utils import _integral_dtypes
@@ -97,3 +98,54 @@ def test_bitwise_left_shift_range(op_dtype):
     z = dpt.bitwise_left_shift(x, y)
 
     assert dpt.all(dpt.equal(z, 0))
+
+
+@pytest.mark.parametrize("dtype", _integral_dtypes)
+def test_bitwise_left_shift_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
+    X <<= int(0)
+
+
+@pytest.mark.parametrize("op1_dtype", _integral_dtypes)
+@pytest.mark.parametrize("op2_dtype", _integral_dtypes)
+def test_bitwise_left_shift_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):
+        ar1 <<= ar2
+        assert dpt.all(ar1 == 2)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        ar3 <<= ar4
+        assert dpt.all(ar3 == 2)
+    else:
+        with pytest.raises(TypeError):
+            ar1 <<= ar2
+            dpt.bitwise_left_shift(ar1, ar2, out=ar1)
+
+    # out is second arg
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
+        dpt.bitwise_left_shift(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 2)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.bitwise_left_shift(ar3, ar4, out=ar4)
+        assert dpt.all(ar4 == 2)
+    else:
+        with pytest.raises(TypeError):
+            dpt.bitwise_left_shift(ar1, ar2, out=ar2)
diff --git a/dpctl/tests/elementwise/test_bitwise_or.py b/dpctl/tests/elementwise/test_bitwise_or.py
index d273bd1507..49949cb795 100644
--- a/dpctl/tests/elementwise/test_bitwise_or.py
+++ b/dpctl/tests/elementwise/test_bitwise_or.py
@@ -18,6 +18,7 @@
 import pytest
 
 import dpctl.tensor as dpt
+from dpctl.tensor._type_utils import _can_cast
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
 
 from .utils import _integral_dtypes
@@ -85,3 +86,58 @@ def test_bitwise_or_bool():
     r_lo = dpt.logical_or(x1[:, dpt.newaxis], x2[dpt.newaxis])
 
     assert dpt.all(dpt.equal(r_bw, r_lo))
+
+
+@pytest.mark.parametrize("dtype", ["?"] + _integral_dtypes)
+def test_bitwise_or_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind == "b":
+        X |= False
+    else:
+        X |= int(0)
+
+
+@pytest.mark.parametrize("op1_dtype", ["?"] + _integral_dtypes)
+@pytest.mark.parametrize("op2_dtype", ["?"] + _integral_dtypes)
+def test_bitwise_or_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):
+        ar1 |= ar2
+        assert dpt.all(ar1 == 1)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        ar3 |= ar4
+        assert dpt.all(ar3 == 1)
+    else:
+        with pytest.raises(TypeError):
+            ar1 |= ar2
+            dpt.bitwise_or(ar1, ar2, out=ar1)
+
+    # out is second arg
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
+        dpt.bitwise_or(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 1)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.bitwise_or(ar3, ar4, out=ar4)
+        assert dpt.all(ar4 == 1)
+    else:
+        with pytest.raises(TypeError):
+            dpt.bitwise_or(ar1, ar2, out=ar2)
diff --git a/dpctl/tests/elementwise/test_bitwise_right_shift.py b/dpctl/tests/elementwise/test_bitwise_right_shift.py
index ceadb9414d..37112133db 100644
--- a/dpctl/tests/elementwise/test_bitwise_right_shift.py
+++ b/dpctl/tests/elementwise/test_bitwise_right_shift.py
@@ -18,6 +18,7 @@
 import pytest
 
 import dpctl.tensor as dpt
+from dpctl.tensor._type_utils import _can_cast
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
 
 from .utils import _integral_dtypes
@@ -97,3 +98,54 @@ def test_bitwise_right_shift_range(op_dtype):
     z = dpt.bitwise_right_shift(x, y)
 
     assert dpt.all(dpt.equal(z, 0))
+
+
+@pytest.mark.parametrize("dtype", _integral_dtypes)
+def test_bitwise_right_shift_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
+    X >>= int(0)
+
+
+@pytest.mark.parametrize("op1_dtype", _integral_dtypes)
+@pytest.mark.parametrize("op2_dtype", _integral_dtypes)
+def test_bitwise_right_shift_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):
+        ar1 >>= ar2
+        assert dpt.all(ar1 == 0)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        ar3 >>= ar4
+        assert dpt.all(ar3 == 0)
+    else:
+        with pytest.raises(TypeError):
+            ar1 >>= ar2
+            dpt.bitwise_right_shift(ar1, ar2, out=ar1)
+
+    # out is second arg
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
+        dpt.bitwise_right_shift(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 0)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.bitwise_right_shift(ar3, ar4, out=ar4)
+        assert dpt.all(ar4 == 0)
+    else:
+        with pytest.raises(TypeError):
+            dpt.bitwise_right_shift(ar1, ar2, out=ar2)
diff --git a/dpctl/tests/elementwise/test_bitwise_xor.py b/dpctl/tests/elementwise/test_bitwise_xor.py
index b2cb11bc84..e9501b642f 100644
--- a/dpctl/tests/elementwise/test_bitwise_xor.py
+++ b/dpctl/tests/elementwise/test_bitwise_xor.py
@@ -18,6 +18,7 @@
 import pytest
 
 import dpctl.tensor as dpt
+from dpctl.tensor._type_utils import _can_cast
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
 
 from .utils import _integral_dtypes
@@ -85,3 +86,58 @@ def test_bitwise_xor_bool():
     r_lo = dpt.logical_xor(x1[:, dpt.newaxis], x2[dpt.newaxis])
 
     assert dpt.all(dpt.equal(r_bw, r_lo))
+
+
+@pytest.mark.parametrize("dtype", ["?"] + _integral_dtypes)
+def test_bitwise_xor_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.zeros((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind == "b":
+        X ^= False
+    else:
+        X ^= int(0)
+
+
+@pytest.mark.parametrize("op1_dtype", ["?"] + _integral_dtypes)
+@pytest.mark.parametrize("op2_dtype", ["?"] + _integral_dtypes)
+def test_bitwise_xor_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):
+        ar1 ^= ar2
+        assert dpt.all(ar1 == 0)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        ar3 ^= ar4
+        assert dpt.all(ar3 == 0)
+    else:
+        with pytest.raises(TypeError):
+            ar1 ^= ar2
+            dpt.bitwise_xor(ar1, ar2, out=ar1)
+
+    # out is second arg
+    ar1 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype, sycl_queue=q)
+    if _can_cast(ar1.dtype, ar2.dtype, _fp16, _fp64):
+        dpt.bitwise_xor(ar1, ar2, out=ar2)
+        assert dpt.all(ar2 == 0)
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype, sycl_queue=q)[::-1]
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype, sycl_queue=q)[::2]
+        dpt.bitwise_xor(ar3, ar4, out=ar4)
+        assert dpt.all(ar4 == 0)
+    else:
+        with pytest.raises(TypeError):
+            dpt.bitwise_xor(ar1, ar2, out=ar2)
diff --git a/dpctl/tests/elementwise/test_pow.py b/dpctl/tests/elementwise/test_pow.py
index 1f13e2b533..8b76e3a9fc 100644
--- a/dpctl/tests/elementwise/test_pow.py
+++ b/dpctl/tests/elementwise/test_pow.py
@@ -21,6 +21,7 @@
 
 import dpctl
 import dpctl.tensor as dpt
+from dpctl.tensor._type_utils import _can_cast
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
 
 from .utils import _all_dtypes, _compare_dtypes, _usm_types
@@ -152,3 +153,60 @@ def test_pow_python_scalar(arr_dt):
     assert isinstance(R, dpt.usm_ndarray)
     R = dpt.pow(sc, X)
     assert isinstance(R, dpt.usm_ndarray)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes[1:])
+def test_pow_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.ones((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind in "ui":
+        X **= int(1)
+    elif dt_kind == "f":
+        X **= float(1)
+    elif dt_kind == "c":
+        X **= complex(1)
+
+
+@pytest.mark.parametrize("op1_dtype", _all_dtypes[1:])
+@pytest.mark.parametrize("op2_dtype", _all_dtypes[1:])
+def test_pow_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):
+        ar1 **= ar2
+        assert (
+            dpt.asnumpy(ar1) == np.full(ar1.shape, 1, dtype=ar1.dtype)
+        ).all()
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype)
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+        ar3[::-1] **= ar4[::2]
+        assert (
+            dpt.asnumpy(ar3) == np.full(ar3.shape, 1, dtype=ar3.dtype)
+        ).all()
+
+    else:
+        with pytest.raises(TypeError):
+            ar1 **= ar2
+
+
+def test_pow_inplace_basic():
+    get_queue_or_skip()
+
+    x = dpt.arange(10, dtype="i4")
+    expected = dpt.square(x)
+    x **= 2
+
+    assert dpt.all(x == expected)
diff --git a/dpctl/tests/elementwise/test_remainder.py b/dpctl/tests/elementwise/test_remainder.py
index def594f269..47500954a2 100644
--- a/dpctl/tests/elementwise/test_remainder.py
+++ b/dpctl/tests/elementwise/test_remainder.py
@@ -21,6 +21,7 @@
 
 import dpctl
 import dpctl.tensor as dpt
+from dpctl.tensor._type_utils import _can_cast
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
 
 from .utils import _compare_dtypes, _no_complex_dtypes, _usm_types
@@ -206,3 +207,54 @@ def test_remainder_python_scalar(arr_dt):
     assert isinstance(R, dpt.usm_ndarray)
     R = dpt.remainder(sc, X)
     assert isinstance(R, dpt.usm_ndarray)
+
+
+@pytest.mark.parametrize("dtype", _no_complex_dtypes[1:])
+def test_remainder_inplace_python_scalar(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+    X = dpt.ones((10, 10), dtype=dtype, sycl_queue=q)
+    dt_kind = X.dtype.kind
+    if dt_kind in "ui":
+        X %= int(1)
+    elif dt_kind == "f":
+        X %= float(1)
+
+
+@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes[1:])
+@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes[1:])
+def test_remainder_inplace_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    dev = q.sycl_device
+    _fp16 = dev.has_aspect_fp16
+    _fp64 = dev.has_aspect_fp64
+    if _can_cast(ar2.dtype, ar1.dtype, _fp16, _fp64):
+        ar1 %= ar2
+        assert dpt.all(ar1 == dpt.zeros(ar1.shape, dtype=ar1.dtype))
+
+        ar3 = dpt.ones(sz, dtype=op1_dtype)
+        ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+        ar3[::-1] %= ar4[::2]
+        assert dpt.all(ar3 == dpt.zeros(ar3.shape, dtype=ar3.dtype))
+
+    else:
+        with pytest.raises(TypeError):
+            ar1 %= ar2
+
+
+def test_remainder_inplace_basic():
+    get_queue_or_skip()
+
+    x = dpt.arange(10, dtype="i4")
+    expected = x & 1
+    x %= 2
+
+    assert dpt.all(x == expected)

From 2eba93eac5b1767822fe37af731e38da8a2575a3 Mon Sep 17 00:00:00 2001
From: ndgrigorian <46709016+ndgrigorian@users.noreply.github.com>
Date: Wed, 25 Oct 2023 13:22:43 -0700
Subject: [PATCH 53/83] Implements ``dpctl.tensor.clip`` (#1444)

* Implements dpctl.tensor.clip

* Clip now consistently yields max where max < min
sycl::clamp would yield max or min depending on the platform
A test has been added for this behavior

* Adds more tests for clip

* Removed redundant branches in clip and elementwise function calls
As the result dtype of the out array is already checked when overlap is
checked, checking again later is superfluous

* Removed more redundant logic from clip

* Fixed order logic in clip
Now properly accounts for all three arrays in all branches

* Adds more compute follows data
tests for clip * Tests to increase coverage of _clip.py (#1451) * Clip raises ValueError when types cannot be resolved --------- Co-authored-by: Oleksandr Pavlyk --- dpctl/tensor/CMakeLists.txt | 2 + dpctl/tensor/__init__.py | 2 + dpctl/tensor/_clip.py | 837 ++++++++++++++++++ dpctl/tensor/_elementwise_common.py | 7 +- .../tensor/libtensor/include/kernels/clip.hpp | 311 +++++++ dpctl/tensor/libtensor/source/clip.cpp | 269 ++++++ dpctl/tensor/libtensor/source/clip.hpp | 52 ++ dpctl/tensor/libtensor/source/tensor_py.cpp | 14 + dpctl/tests/test_tensor_clip.py | 627 +++++++++++++ 9 files changed, 2115 insertions(+), 6 deletions(-) create mode 100644 dpctl/tensor/_clip.py create mode 100644 dpctl/tensor/libtensor/include/kernels/clip.hpp create mode 100644 dpctl/tensor/libtensor/source/clip.cpp create mode 100644 dpctl/tensor/libtensor/source/clip.hpp create mode 100644 dpctl/tests/test_tensor_clip.py diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 35ca62198f..5247b4953b 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -121,6 +121,7 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reduction_over_axis.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ) list(APPEND _tensor_impl_sources ${_elementwise_sources} @@ -138,6 +139,7 @@ set(_no_fast_math_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reduction_over_axis.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ) list(APPEND _no_fast_math_sources ${_elementwise_sources} diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index bab31379b7..209a6d4e56 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -93,6 +93,7 @@ from dpctl.tensor._usmarray import usm_ndarray from dpctl.tensor._utility_functions import all, any +from ._clip import clip from ._constants import e, inf, nan, newaxis, pi from ._elementwise_funcs import ( abs, @@ -322,4 +323,5 @@ "exp2", "copysign", "rsqrt", + "clip", ] diff --git a/dpctl/tensor/_clip.py b/dpctl/tensor/_clip.py new file mode 100644 index 0000000000..5a3a96933f --- /dev/null +++ b/dpctl/tensor/_clip.py @@ -0,0 +1,837 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
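+
+# `clip` resolves Python scalars passed as `min`/`max` ("weak" types, in
+# NEP 50 terms) against the array dtype of `x` ("strong" type) before any
+# casting or kernel dispatch; the helper functions below implement that
+# resolution.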
+
+import dpctl
+import dpctl.tensor as dpt
+import dpctl.tensor._tensor_impl as ti
+from dpctl.tensor._copy_utils import (
+    _empty_like_orderK,
+    _empty_like_pair_orderK,
+    _empty_like_triple_orderK,
+)
+from dpctl.tensor._elementwise_common import (
+    WeakBooleanType,
+    WeakComplexType,
+    WeakFloatingType,
+    WeakIntegralType,
+    _get_dtype,
+    _get_queue_usm_type,
+    _get_shape,
+    _strong_dtype_num_kind,
+    _validate_dtype,
+    _weak_type_num_kind,
+)
+from dpctl.tensor._manipulation_functions import _broadcast_shape_impl
+from dpctl.tensor._type_utils import _can_cast, _to_device_supported_dtype
+from dpctl.utils import ExecutionPlacementError
+
+
+def _resolve_one_strong_two_weak_types(st_dtype, dtype1, dtype2, dev):
+    "Resolves weak data types per NEP-0050,"
+    "where the second and third arguments are"
+    "permitted to be weak types"
+    if isinstance(
+        st_dtype,
+        (
+            WeakBooleanType,
+            WeakIntegralType,
+            WeakFloatingType,
+            WeakComplexType,
+        ),
+    ):
+        raise ValueError
+    if isinstance(
+        dtype1,
+        (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType),
+    ):
+        if isinstance(
+            dtype2,
+            (
+                WeakBooleanType,
+                WeakIntegralType,
+                WeakFloatingType,
+                WeakComplexType,
+            ),
+        ):
+            kind_num1 = _weak_type_num_kind(dtype1)
+            kind_num2 = _weak_type_num_kind(dtype2)
+            st_kind_num = _strong_dtype_num_kind(st_dtype)
+
+            if kind_num1 > st_kind_num:
+                if isinstance(dtype1, WeakIntegralType):
+                    ret_dtype1 = dpt.dtype(ti.default_device_int_type(dev))
+                elif isinstance(dtype1, WeakComplexType):
+                    if st_dtype is dpt.float16 or st_dtype is dpt.float32:
+                        ret_dtype1 = dpt.complex64
+                    else:
+                        ret_dtype1 = _to_device_supported_dtype(
+                            dpt.complex128, dev
+                        )
+                else:
+                    ret_dtype1 = _to_device_supported_dtype(dpt.float64, dev)
+            else:
+                ret_dtype1 = st_dtype
+
+            if kind_num2 > st_kind_num:
+                if isinstance(dtype2, WeakIntegralType):
+                    ret_dtype2 = dpt.dtype(ti.default_device_int_type(dev))
+                elif isinstance(dtype2, WeakComplexType):
+                    if st_dtype is dpt.float16 or st_dtype is dpt.float32:
+                        ret_dtype2 = dpt.complex64
+                    else:
+                        ret_dtype2 = _to_device_supported_dtype(
+                            dpt.complex128, dev
+                        )
+                else:
+                    ret_dtype2 = _to_device_supported_dtype(dpt.float64, dev)
+            else:
+                ret_dtype2 = st_dtype
+
+            return ret_dtype1, ret_dtype2
+
+        max_dt_num_kind, max_dtype = max(
+            [
+                (_strong_dtype_num_kind(st_dtype), st_dtype),
+                (_strong_dtype_num_kind(dtype2), dtype2),
+            ]
+        )
+        dt1_kind_num = _weak_type_num_kind(dtype1)
+        if dt1_kind_num > max_dt_num_kind:
+            if isinstance(dtype1, WeakIntegralType):
+                return dpt.dtype(ti.default_device_int_type(dev)), dtype2
+            if isinstance(dtype1, WeakComplexType):
+                if max_dtype is dpt.float16 or max_dtype is dpt.float32:
+                    return dpt.complex64, dtype2
+                return (
+                    _to_device_supported_dtype(dpt.complex128, dev),
+                    dtype2,
+                )
+            return _to_device_supported_dtype(dpt.float64, dev), dtype2
+        else:
+            return max_dtype, dtype2
+    elif isinstance(
+        dtype2,
+        (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType),
+    ):
+        max_dt_num_kind, max_dtype = max(
+            [
+                (_strong_dtype_num_kind(st_dtype), st_dtype),
+                (_strong_dtype_num_kind(dtype1), dtype1),
+            ]
+        )
+        dt2_kind_num = _weak_type_num_kind(dtype2)
+        if dt2_kind_num > max_dt_num_kind:
+            if isinstance(dtype2, WeakIntegralType):
+                return dtype1, dpt.dtype(ti.default_device_int_type(dev))
+            if isinstance(dtype2, WeakComplexType):
+                if max_dtype is dpt.float16 or max_dtype is dpt.float32:
+                    return dtype1, dpt.complex64
+                return (
+                    dtype1,
+                    _to_device_supported_dtype(dpt.complex128, dev),
+                )
+            return dtype1, _to_device_supported_dtype(dpt.float64, dev)
+        else:
+            return dtype1, max_dtype
+    else:
+        # both are strong dtypes
+        # return unmodified
+        return dtype1, dtype2
+
+
+def _resolve_one_strong_one_weak_types(st_dtype, dtype, dev):
+    "Resolves one weak data type with one strong data type per NEP-0050"
+    if isinstance(
+        st_dtype,
+        (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType),
+    ):
+        raise ValueError
+    if isinstance(
+        dtype,
+        (WeakBooleanType, WeakIntegralType, WeakFloatingType, WeakComplexType),
+    ):
+        st_kind_num = _strong_dtype_num_kind(st_dtype)
+        kind_num = _weak_type_num_kind(dtype)
+        if kind_num > st_kind_num:
+            if isinstance(dtype, WeakIntegralType):
+                return dpt.dtype(ti.default_device_int_type(dev))
+            if isinstance(dtype, WeakComplexType):
+                if st_dtype is dpt.float16 or st_dtype is dpt.float32:
+                    return dpt.complex64
+                return _to_device_supported_dtype(dpt.complex128, dev)
+            return _to_device_supported_dtype(dpt.float64, dev)
+        else:
+            return st_dtype
+    else:
+        return dtype
+
+
+def _check_clip_dtypes(res_dtype, arg1_dtype, arg2_dtype, sycl_dev):
+    "Checks if both types `arg1_dtype` and `arg2_dtype` can be"
+    "cast to `res_dtype` according to the rule `safe`"
+    if arg1_dtype == res_dtype and arg2_dtype == res_dtype:
+        return None, None, res_dtype
+
+    _fp16 = sycl_dev.has_aspect_fp16
+    _fp64 = sycl_dev.has_aspect_fp64
+    if _can_cast(arg1_dtype, res_dtype, _fp16, _fp64) and _can_cast(
+        arg2_dtype, res_dtype, _fp16, _fp64
+    ):
+        # prevent unnecessary casting
+        ret_buf1_dt = None if res_dtype == arg1_dtype else res_dtype
+        ret_buf2_dt = None if res_dtype == arg2_dtype else res_dtype
+        return ret_buf1_dt, ret_buf2_dt, res_dtype
+    else:
+        return None, None, None
+
+
+def _clip_none(x, val, out, order, _binary_fn):
+    if order not in ["K", "C", "F", "A"]:
+        order = "K"
+    q1, x_usm_type = x.sycl_queue, x.usm_type
+    q2, val_usm_type = _get_queue_usm_type(val)
+    if q2 is None:
+        exec_q = q1
+        res_usm_type = x_usm_type
+    else:
+        exec_q = dpctl.utils.get_execution_queue((q1, q2))
+        if exec_q is None:
+            raise ExecutionPlacementError(
+                "Execution placement can not be unambiguously inferred "
+                "from input arguments."
+            )
+        res_usm_type = dpctl.utils.get_coerced_usm_type(
+            (
+                x_usm_type,
+                val_usm_type,
+            )
+        )
+    dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+    x_shape = x.shape
+    val_shape = _get_shape(val)
+    if not isinstance(val_shape, (tuple, list)):
+        raise TypeError(
+            "Shape of arguments can not be inferred. "
+            "Arguments are expected to be "
+            "lists, tuples, or both"
+        )
+    try:
+        res_shape = _broadcast_shape_impl(
+            [
+                x_shape,
+                val_shape,
+            ]
+        )
+    except ValueError:
+        raise ValueError(
+            "operands could not be broadcast together with shapes "
+            f"{x_shape} and {val_shape}"
+        )
+    sycl_dev = exec_q.sycl_device
+    x_dtype = x.dtype
+    val_dtype = _get_dtype(val, sycl_dev)
+    if not _validate_dtype(val_dtype):
+        raise ValueError("Operands have unsupported data types")
+
+    val_dtype = _resolve_one_strong_one_weak_types(x_dtype, val_dtype, sycl_dev)
+
+    res_dt = x.dtype
+    _fp16 = sycl_dev.has_aspect_fp16
+    _fp64 = sycl_dev.has_aspect_fp64
+    if not _can_cast(val_dtype, res_dt, _fp16, _fp64):
+        raise ValueError(
+            "function 'clip' does not support input types "
+            f"({x_dtype}, {val_dtype}), "
+            "and the inputs could not be safely coerced to any "
+            "supported types according to the casting rule ''safe''."
+ ) + + orig_out = out + if out is not None: + if not isinstance(out, dpt.usm_ndarray): + raise TypeError( + f"output array must be of usm_ndarray type, got {type(out)}" + ) + + if out.shape != res_shape: + raise ValueError( + "The shape of input and output arrays are inconsistent. " + f"Expected output shape is {res_shape}, got {out.shape}" + ) + + if res_dt != out.dtype: + raise ValueError( + f"Output array of type {res_dt} is needed, got {out.dtype}" + ) + + if dpctl.utils.get_execution_queue((exec_q, out.sycl_queue)) is None: + raise ExecutionPlacementError( + "Input and output allocation queues are not compatible" + ) + + if ti._array_overlap(x, out): + if not ti._same_logical_tensors(x, out): + out = dpt.empty_like(out) + + if isinstance(val, dpt.usm_ndarray): + if ( + ti._array_overlap(val, out) + and not ti._same_logical_tensors(val, out) + and val_dtype == res_dt + ): + out = dpt.empty_like(out) + + if isinstance(val, dpt.usm_ndarray): + val_ary = val + else: + val_ary = dpt.asarray(val, dtype=val_dtype, sycl_queue=exec_q) + + if val_dtype == res_dt: + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + x, val_ary, res_dt, res_shape, res_usm_type, exec_q + ) + else: + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x, + val_ary, + ) + ) + else "C" + ) + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + if x_shape != res_shape: + x = dpt.broadcast_to(x, res_shape) + if val_ary.shape != res_shape: + val_ary = dpt.broadcast_to(val_ary, res_shape) + ht_binary_ev, binary_ev = _binary_fn( + src1=x, src2=val_ary, dst=out, sycl_queue=exec_q + ) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + ht_copy_out_ev.wait() + out = orig_out + ht_binary_ev.wait() + return out + else: + if order == "K": + buf = _empty_like_orderK(val_ary, res_dt) + else: + if order == "A": + order = "F" if x.flags.f_contiguous else "C" + buf = dpt.empty_like(val_ary, dtype=res_dt, order=order) + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=val_ary, dst=buf, sycl_queue=exec_q + ) + if out is None: + if order == "K": + out = _empty_like_pair_orderK( + x, buf, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + if x_shape != res_shape: + x = dpt.broadcast_to(x, res_shape) + buf = dpt.broadcast_to(buf, res_shape) + ht_binary_ev, binary_ev = _binary_fn( + src1=x, + src2=buf, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + ht_copy_out_ev.wait() + out = orig_out + ht_copy_ev.wait() + ht_binary_ev.wait() + return out + + +# need to handle logic for min or max being None +def clip(x, min=None, max=None, out=None, order="K"): + """clip(x, min, max, out=None, order="K") + + Clips to the range [`min_i`, `max_i`] for each element `x_i` + in `x`. + + Args: + x (usm_ndarray): Array containing elements to clip. + Must be compatible with `min` and `max` according + to broadcasting rules. 
+        min ({None, usm_ndarray}, optional): Array containing minimum values.
+            Must be compatible with `x` and `max` according
+            to broadcasting rules.
+            Only one of `min` and `max` can be `None`.
+        max ({None, usm_ndarray}, optional): Array containing maximum values.
+            Must be compatible with `x` and `min` according
+            to broadcasting rules.
+            Only one of `min` and `max` can be `None`.
+        out ({None, usm_ndarray}, optional):
+            Output array to populate.
+            Array must have the correct shape and the expected data type.
+        order ("C","F","A","K", optional):
+            Memory layout of the newly output array, if parameter `out` is
+            `None`.
+            Default: "K".
+
+    Returns:
+        usm_ndarray:
+            An array with elements clipped to the range [`min`, `max`].
+            The returned array has the same data type as `x`.
+    """
+    if not isinstance(x, dpt.usm_ndarray):
+        raise TypeError(
+            "Expected `x` to be of dpctl.tensor.usm_ndarray type, got "
+            f"{type(x)}"
+        )
+    if min is None and max is None:
+        raise ValueError(
+            "only one of `min` and `max` is permitted to be `None`"
+        )
+    elif max is None:
+        return _clip_none(x, min, out, order, ti._maximum)
+    elif min is None:
+        return _clip_none(x, max, out, order, ti._minimum)
+    else:
+        q1, x_usm_type = x.sycl_queue, x.usm_type
+        q2, min_usm_type = _get_queue_usm_type(min)
+        q3, max_usm_type = _get_queue_usm_type(max)
+        if q2 is None and q3 is None:
+            exec_q = q1
+            res_usm_type = x_usm_type
+        elif q3 is None:
+            exec_q = dpctl.utils.get_execution_queue((q1, q2))
+            if exec_q is None:
+                raise ExecutionPlacementError(
+                    "Execution placement can not be unambiguously inferred "
+                    "from input arguments."
+                )
+            res_usm_type = dpctl.utils.get_coerced_usm_type(
+                (
+                    x_usm_type,
+                    min_usm_type,
+                )
+            )
+        elif q2 is None:
+            exec_q = dpctl.utils.get_execution_queue((q1, q3))
+            if exec_q is None:
+                raise ExecutionPlacementError(
+                    "Execution placement can not be unambiguously inferred "
+                    "from input arguments."
+                )
+            res_usm_type = dpctl.utils.get_coerced_usm_type(
+                (
+                    x_usm_type,
+                    max_usm_type,
+                )
+            )
+        else:
+            exec_q = dpctl.utils.get_execution_queue((q1, q2, q3))
+            if exec_q is None:
+                raise ExecutionPlacementError(
+                    "Execution placement can not be unambiguously inferred "
+                    "from input arguments."
+                )
+            res_usm_type = dpctl.utils.get_coerced_usm_type(
+                (
+                    x_usm_type,
+                    min_usm_type,
+                    max_usm_type,
+                )
+            )
+        dpctl.utils.validate_usm_type(res_usm_type, allow_none=False)
+        x_shape = x.shape
+        min_shape = _get_shape(min)
+        max_shape = _get_shape(max)
+        if not all(
+            isinstance(s, (tuple, list))
+            for s in (
+                min_shape,
+                max_shape,
+            )
+        ):
+            raise TypeError(
+                "Shape of arguments can not be inferred. "
+                "Arguments are expected to be "
+                "lists, tuples, or both"
+            )
+        try:
+            res_shape = _broadcast_shape_impl(
+                [
+                    x_shape,
+                    min_shape,
+                    max_shape,
+                ]
+            )
+        except ValueError:
+            raise ValueError(
+                "operands could not be broadcast together with shapes "
+                f"{x_shape}, {min_shape}, and {max_shape}"
+            )
+        sycl_dev = exec_q.sycl_device
+        x_dtype = x.dtype
+        min_dtype = _get_dtype(min, sycl_dev)
+        max_dtype = _get_dtype(max, sycl_dev)
+        if not all(_validate_dtype(o) for o in (min_dtype, max_dtype)):
+            raise ValueError("Operands have unsupported data types")
+
+        min_dtype, max_dtype = _resolve_one_strong_two_weak_types(
+            x_dtype, min_dtype, max_dtype, sycl_dev
+        )
+
+        buf1_dt, buf2_dt, res_dt = _check_clip_dtypes(
+            x_dtype,
+            min_dtype,
+            max_dtype,
+            sycl_dev,
+        )
+
+        if res_dt is None:
+            raise ValueError(
+                "function 'clip' does not support input types "
+                f"({x_dtype}, {min_dtype}, {max_dtype}), "
+                "and the inputs could not be safely coerced to any "
+                "supported types according to the casting rule ''safe''."
+            )
+
+        orig_out = out
+        if out is not None:
+            if not isinstance(out, dpt.usm_ndarray):
+                raise TypeError(
+                    "output array must be of usm_ndarray type, got "
+                    f"{type(out)}"
+                )
+
+            if out.shape != res_shape:
+                raise ValueError(
+                    "The shape of input and output arrays are "
+                    f"inconsistent. Expected output shape is {res_shape}, "
+                    f"got {out.shape}"
+                )
+
+            if res_dt != out.dtype:
+                raise ValueError(
+                    f"Output array of type {res_dt} is needed, "
+                    f"got {out.dtype}"
+                )
+
+            if (
+                dpctl.utils.get_execution_queue((exec_q, out.sycl_queue))
+                is None
+            ):
+                raise ExecutionPlacementError(
+                    "Input and output allocation queues are not compatible"
+                )
+
+            if ti._array_overlap(x, out):
+                if not ti._same_logical_tensors(x, out):
+                    out = dpt.empty_like(out)
+
+            if isinstance(min, dpt.usm_ndarray):
+                if (
+                    ti._array_overlap(min, out)
+                    and not ti._same_logical_tensors(min, out)
+                    and buf1_dt is None
+                ):
+                    out = dpt.empty_like(out)
+
+            if isinstance(max, dpt.usm_ndarray):
+                if (
+                    ti._array_overlap(max, out)
+                    and not ti._same_logical_tensors(max, out)
+                    and buf2_dt is None
+                ):
+                    out = dpt.empty_like(out)
+
+        if isinstance(min, dpt.usm_ndarray):
+            a_min = min
+        else:
+            a_min = dpt.asarray(min, dtype=min_dtype, sycl_queue=exec_q)
+        if isinstance(max, dpt.usm_ndarray):
+            a_max = max
+        else:
+            a_max = dpt.asarray(max, dtype=max_dtype, sycl_queue=exec_q)
+
+        if buf1_dt is None and buf2_dt is None:
+            if out is None:
+                if order == "K":
+                    out = _empty_like_triple_orderK(
+                        x,
+                        a_min,
+                        a_max,
+                        res_dt,
+                        res_shape,
+                        res_usm_type,
+                        exec_q,
+                    )
+                else:
+                    if order == "A":
+                        order = (
+                            "F"
+                            if all(
+                                arr.flags.f_contiguous
+                                for arr in (
+                                    x,
+                                    a_min,
+                                    a_max,
+                                )
+                            )
+                            else "C"
+                        )
+                    out = dpt.empty(
+                        res_shape,
+                        dtype=res_dt,
+                        usm_type=res_usm_type,
+                        sycl_queue=exec_q,
+                        order=order,
+                    )
+            if x_shape != res_shape:
+                x = dpt.broadcast_to(x, res_shape)
+            if a_min.shape != res_shape:
+                a_min = dpt.broadcast_to(a_min, res_shape)
+            if a_max.shape != res_shape:
+                a_max = dpt.broadcast_to(a_max, res_shape)
+            ht_binary_ev, binary_ev = ti._clip(
+                src=x, min=a_min, max=a_max, dst=out, sycl_queue=exec_q
+            )
+            if not (orig_out is None or orig_out is out):
+                # Copy the out data from temporary buffer to original memory
+                ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray(
+                    src=out,
+                    dst=orig_out,
+                    sycl_queue=exec_q,
+                    depends=[binary_ev],
+                )
+                ht_copy_out_ev.wait()
+                out = orig_out
+            ht_binary_ev.wait()
+            return out
+
+        elif buf1_dt is None:
+            if order == "K":
+                buf2 =
_empty_like_orderK(a_max, buf2_dt) + else: + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x, + a_min, + ) + ) + else "C" + ) + buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order) + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_max, dst=buf2, sycl_queue=exec_q + ) + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, + a_min, + buf2, + res_dt, + res_shape, + res_usm_type, + exec_q, + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + x = dpt.broadcast_to(x, res_shape) + if a_min.shape != res_shape: + a_min = dpt.broadcast_to(a_min, res_shape) + buf2 = dpt.broadcast_to(buf2, res_shape) + ht_binary_ev, binary_ev = ti._clip( + src=x, + min=a_min, + max=buf2, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + ht_copy_out_ev.wait() + out = orig_out + ht_copy_ev.wait() + ht_binary_ev.wait() + return out + + elif buf2_dt is None: + if order == "K": + buf1 = _empty_like_orderK(a_min, buf1_dt) + else: + if order == "A": + order = ( + "F" + if all( + arr.flags.f_contiguous + for arr in ( + x, + a_max, + ) + ) + else "C" + ) + buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) + ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_min, dst=buf1, sycl_queue=exec_q + ) + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, + buf1, + a_max, + res_dt, + res_shape, + res_usm_type, + exec_q, + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + x = dpt.broadcast_to(x, res_shape) + buf1 = dpt.broadcast_to(buf1, res_shape) + if a_max.shape != res_shape: + a_max = dpt.broadcast_to(a_max, res_shape) + ht_binary_ev, binary_ev = ti._clip( + src=x, + min=buf1, + max=a_max, + dst=out, + sycl_queue=exec_q, + depends=[copy_ev], + ) + if not (orig_out is None or orig_out is out): + # Copy the out data from temporary buffer to original memory + ht_copy_out_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=out, + dst=orig_out, + sycl_queue=exec_q, + depends=[binary_ev], + ) + ht_copy_out_ev.wait() + out = orig_out + ht_copy_ev.wait() + ht_binary_ev.wait() + return out + + if order in ["K", "A"]: + if ( + x.flags.f_contiguous + and a_min.flags.f_contiguous + and a_max.flags.f_contiguous + ): + order = "F" + elif ( + x.flags.c_contiguous + and a_min.flags.c_contiguous + and a_max.flags.c_contiguous + ): + order = "C" + else: + order = "C" if order == "A" else "K" + if order == "K": + buf1 = _empty_like_orderK(a_min, buf1_dt) + else: + buf1 = dpt.empty_like(a_min, dtype=buf1_dt, order=order) + ht_copy1_ev, copy1_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_min, dst=buf1, sycl_queue=exec_q + ) + if order == "K": + buf2 = _empty_like_orderK(a_max, buf2_dt) + else: + buf2 = dpt.empty_like(a_max, dtype=buf2_dt, order=order) + ht_copy2_ev, copy2_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_max, dst=buf2, sycl_queue=exec_q + ) + if out is None: + if order == "K": + out = _empty_like_triple_orderK( + x, buf1, buf2, res_dt, res_shape, res_usm_type, exec_q + ) + else: + out = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + order=order, + ) + + 
x = dpt.broadcast_to(x, res_shape)
+        buf1 = dpt.broadcast_to(buf1, res_shape)
+        buf2 = dpt.broadcast_to(buf2, res_shape)
+        ht_, _ = ti._clip(
+            src=x,
+            min=buf1,
+            max=buf2,
+            dst=out,
+            sycl_queue=exec_q,
+            depends=[copy1_ev, copy2_ev],
+        )
+        dpctl.SyclEvent.wait_for([ht_copy1_ev, ht_copy2_ev, ht_])
+        return out
diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py
index fca5b0734a..baaac078b5 100644
--- a/dpctl/tensor/_elementwise_common.py
+++ b/dpctl/tensor/_elementwise_common.py
@@ -649,12 +649,7 @@ def __call__(self, o1, o2, out=None, order="K"):
                     sycl_queue=exec_q,
                     order=order,
                 )
-            else:
-                if res_dt != out.dtype:
-                    raise TypeError(
-                        f"Output array of type {res_dt} is needed,"
-                        f"got {out.dtype}"
-                    )
+
             if src1.shape != res_shape:
                 src1 = dpt.broadcast_to(src1, res_shape)
             buf2 = dpt.broadcast_to(buf2, res_shape)
diff --git a/dpctl/tensor/libtensor/include/kernels/clip.hpp b/dpctl/tensor/libtensor/include/kernels/clip.hpp
new file mode 100644
index 0000000000..9cca9f615b
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/clip.hpp
@@ -0,0 +1,311 @@
+//=== clip.hpp - Implementation of clip kernels ---*-C++-*--/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for dpctl.tensor.clip.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include "pybind11/numpy.h"
+#include "pybind11/stl.h"
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <complex>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include "utils/math_utils.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_utils.hpp"
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace kernels
+{
+namespace clip
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using namespace dpctl::tensor::offset_utils;
+
+template <typename T> T clip(const T &x, const T &min, const T &max)
+{
+    using dpctl::tensor::type_utils::is_complex;
+    if constexpr (is_complex<T>::value) {
+        using dpctl::tensor::math_utils::max_complex;
+        using dpctl::tensor::math_utils::min_complex;
+        return min_complex(max_complex(x, min), max);
+    }
+    else if constexpr (std::is_floating_point_v<T> ||
+                       std::is_same_v<T, sycl::half>)
+    {
+        auto tmp = (std::isnan(x) || x > min) ? x : min;
+        return (std::isnan(tmp) || tmp < max) ? tmp : max;
+    }
+    else if constexpr (std::is_same_v<T, bool>) {
+        return (x || min) && max;
+    }
+    else {
+        auto tmp = (x > min) ? x : min;
+        return (tmp < max) ? tmp : max;
+    }
+}
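+
+// Note: unlike sycl::clamp, the scalar helper above always yields `max`
+// when `max < min`, and the floating-point branch propagates NaNs, so
+// results are consistent across platforms.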
+
+template <typename T, int vec_sz = 4, int n_vecs = 2> class ClipContigFunctor
+{
+private:
+    size_t nelems = 0;
+    const T *x_p = nullptr;
+    const T *min_p = nullptr;
+    const T *max_p = nullptr;
+    T *dst_p = nullptr;
+
+public:
+    ClipContigFunctor(size_t nelems_,
+                      const T *x_p_,
+                      const T *min_p_,
+                      const T *max_p_,
+                      T *dst_p_)
+        : nelems(nelems_), x_p(x_p_), min_p(min_p_), max_p(max_p_),
+          dst_p(dst_p_)
+    {
+    }
+
+    void operator()(sycl::nd_item<1> ndit) const
+    {
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (is_complex<T>::value) {
+            std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
+            size_t base = ndit.get_global_linear_id();
+
+            base = (base / sgSize) * sgSize * n_vecs * vec_sz + (base % sgSize);
+            for (size_t offset = base;
+                 offset < std::min(nelems, base + sgSize * (n_vecs * vec_sz));
+                 offset += sgSize)
+            {
+                dst_p[offset] = clip(x_p[offset], min_p[offset], max_p[offset]);
+            }
+        }
+        else {
+            auto sg = ndit.get_sub_group();
+            std::uint8_t sgSize = sg.get_local_range()[0];
+            std::uint8_t max_sgSize = sg.get_max_local_range()[0];
+            size_t base = n_vecs * vec_sz *
+                          (ndit.get_group(0) * ndit.get_local_range(0) +
+                           sg.get_group_id()[0] * max_sgSize);
+
+            if (base + n_vecs * vec_sz * sgSize < nelems &&
+                sgSize == max_sgSize) {
+                sycl::vec<T, vec_sz> x_vec;
+                sycl::vec<T, vec_sz> min_vec;
+                sycl::vec<T, vec_sz> max_vec;
+                sycl::vec<T, vec_sz> dst_vec;
+#pragma unroll
+                for (std::uint8_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                    auto idx = base + it * sgSize;
+                    auto x_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&x_p[idx]);
+                    auto min_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&min_p[idx]);
+                    auto max_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&max_p[idx]);
+                    auto dst_multi_ptr = sycl::address_space_cast<
+                        sycl::access::address_space::global_space,
+                        sycl::access::decorated::yes>(&dst_p[idx]);
+
+                    x_vec = sg.load<vec_sz>(x_multi_ptr);
+                    min_vec = sg.load<vec_sz>(min_multi_ptr);
+                    max_vec = sg.load<vec_sz>(max_multi_ptr);
+#pragma unroll
+                    for (std::uint8_t vec_id = 0; vec_id < vec_sz; ++vec_id) {
+                        dst_vec[vec_id] = clip(x_vec[vec_id], min_vec[vec_id],
+                                               max_vec[vec_id]);
+                    }
+                    sg.store<vec_sz>(dst_multi_ptr, dst_vec);
+                }
+            }
+            else {
+                for (size_t k = base + sg.get_local_id()[0]; k < nelems;
+                     k += sgSize) {
+                    dst_p[k] = clip(x_p[k], min_p[k], max_p[k]);
+                }
+            }
+        }
+    }
+};
+
+template <typename T> class clip_contig_kernel;
+
+typedef sycl::event (*clip_contig_impl_fn_ptr_t)(
+    sycl::queue &,
+    size_t,
+    const char *,
+    const char *,
+    const char *,
+    char *,
+    const std::vector<sycl::event> &);
+
+template <typename T>
+sycl::event clip_contig_impl(sycl::queue &q,
+                             size_t nelems,
+                             const char *x_cp,
+                             const char *min_cp,
+                             const char *max_cp,
+                             char *dst_cp,
+                             const std::vector<sycl::event> &depends)
+{
+    const T *x_tp = reinterpret_cast<const T *>(x_cp);
+    const T *min_tp = reinterpret_cast<const T *>(min_cp);
+    const T *max_tp = reinterpret_cast<const T *>(max_cp);
+    T *dst_tp = reinterpret_cast<T *>(dst_cp);
+
+    sycl::event clip_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        size_t lws = 64;
+        constexpr unsigned int vec_sz = 4;
+        constexpr unsigned int n_vecs = 2;
+        const size_t n_groups =
+            ((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
+        const auto gws_range = sycl::range<1>(n_groups * lws);
+        const auto lws_range = sycl::range<1>(lws);
+
+        cgh.parallel_for<clip_contig_kernel<T>>(
+            sycl::nd_range<1>(gws_range, lws_range),
+            ClipContigFunctor<T, vec_sz, n_vecs>(nelems, x_tp, min_tp, max_tp,
+                                                 dst_tp));
+    });
+
+    return clip_ev;
+}
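+
+// The strided variant below walks the arrays through a four-offset
+// indexer (x, min, max, dst), so arbitrary strides and offsets are
+// supported at the cost of the vectorized sub-group loads used above.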
+
+template <typename T, typename IndexerT> class ClipStridedFunctor
+{
+private:
+    const T *x_p = nullptr;
+    const T *min_p = nullptr;
+    const T *max_p = nullptr;
+    T *dst_p = nullptr;
+    IndexerT indexer;
+
+public:
+    ClipStridedFunctor(const T *x_p_,
+                       const T *min_p_,
+                       const T *max_p_,
+                       T *dst_p_,
+                       IndexerT indexer_)
+        : x_p(x_p_), min_p(min_p_), max_p(max_p_), dst_p(dst_p_),
+          indexer(indexer_)
+    {
+    }
+
+    void operator()(sycl::id<1> id) const
+    {
+        size_t gid = id[0];
+        auto offsets = indexer(static_cast<py::ssize_t>(gid));
+        dst_p[offsets.get_fourth_offset()] = clip(
+            x_p[offsets.get_first_offset()], min_p[offsets.get_second_offset()],
+            max_p[offsets.get_third_offset()]);
+    }
+};
+
+template <typename T, typename IndexerT> class clip_strided_kernel;
+
+typedef sycl::event (*clip_strided_impl_fn_ptr_t)(
+    sycl::queue &,
+    size_t,
+    int,
+    const char *,
+    const char *,
+    const char *,
+    char *,
+    const py::ssize_t *,
+    py::ssize_t,
+    py::ssize_t,
+    py::ssize_t,
+    py::ssize_t,
+    const std::vector<sycl::event> &);
+
+template <typename T>
+sycl::event clip_strided_impl(sycl::queue &q,
+                              size_t nelems,
+                              int nd,
+                              const char *x_cp,
+                              const char *min_cp,
+                              const char *max_cp,
+                              char *dst_cp,
+                              const py::ssize_t *shape_strides,
+                              py::ssize_t x_offset,
+                              py::ssize_t min_offset,
+                              py::ssize_t max_offset,
+                              py::ssize_t dst_offset,
+                              const std::vector<sycl::event> &depends)
+{
+    const T *x_tp = reinterpret_cast<const T *>(x_cp);
+    const T *min_tp = reinterpret_cast<const T *>(min_cp);
+    const T *max_tp = reinterpret_cast<const T *>(max_cp);
+    T *dst_tp = reinterpret_cast<T *>(dst_cp);
+
+    sycl::event clip_ev = q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(depends);
+
+        FourOffsets_StridedIndexer indexer{
+            nd, x_offset, min_offset, max_offset, dst_offset, shape_strides};
+
+        cgh.parallel_for<clip_strided_kernel<T, FourOffsets_StridedIndexer>>(
+            sycl::range<1>(nelems),
+            ClipStridedFunctor<T, FourOffsets_StridedIndexer>(
+                x_tp, min_tp, max_tp, dst_tp, indexer));
+    });
+
+    return clip_ev;
+}
+
+template <typename fnT, typename T> struct ClipStridedFactory
+{
+    fnT get()
+    {
+        fnT fn = clip_strided_impl<T>;
+        return fn;
+    }
+};
+
+template <typename fnT, typename T> struct ClipContigFactory
+{
+    fnT get()
+    {
+
+        fnT fn = clip_contig_impl<T>;
+        return fn;
+    }
+};
+
+} // namespace clip
+} // namespace kernels
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/clip.cpp b/dpctl/tensor/libtensor/source/clip.cpp
new file mode 100644
index 0000000000..ac494c19ae
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/clip.cpp
@@ -0,0 +1,269 @@
+//===-- clip.cpp - Implementation of clip  --*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines Python API for implementation functions of
+/// dpctl.tensor.clip
+//===----------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <complex>
+#include <cstdint>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <utility>
+#include <vector>
+
+#include "clip.hpp"
+#include "kernels/clip.hpp"
+#include "simplify_iteration_space.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::kernels::clip::clip_contig_impl_fn_ptr_t;
+using dpctl::tensor::kernels::clip::clip_strided_impl_fn_ptr_t;
+
+static clip_contig_impl_fn_ptr_t clip_contig_dispatch_vector[td_ns::num_types];
+static clip_strided_impl_fn_ptr_t
+    clip_strided_dispatch_vector[td_ns::num_types];
+
+void init_clip_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    using dpctl::tensor::kernels::clip::ClipContigFactory;
+    DispatchVectorBuilder<clip_contig_impl_fn_ptr_t, ClipContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(clip_contig_dispatch_vector);
+
+    using dpctl::tensor::kernels::clip::ClipStridedFactory;
+    DispatchVectorBuilder<clip_strided_impl_fn_ptr_t, ClipStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(clip_strided_dispatch_vector);
+}
+
+using dpctl::utils::keep_args_alive;
+
+std::pair<sycl::event, sycl::event>
+py_clip(const dpctl::tensor::usm_ndarray &src,
+        const dpctl::tensor::usm_ndarray &min,
+        const dpctl::tensor::usm_ndarray &max,
+        const dpctl::tensor::usm_ndarray &dst,
+        sycl::queue &exec_q,
+        const std::vector<sycl::event> &depends)
+{
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, min, max, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    int nd = src.get_ndim();
+    int min_nd = min.get_ndim();
+    int max_nd = max.get_ndim();
+    int dst_nd = dst.get_ndim();
+
+    if (nd != min_nd || nd != max_nd) {
+        throw py::value_error(
+            "Input arrays are not of appropriate dimension for clip kernel.");
+    }
+
+    if (nd != dst_nd) {
+        throw py::value_error(
+            "Destination is not of appropriate dimension for clip kernel.");
+    }
+
+    const py::ssize_t *src_shape = src.get_shape_raw();
+    const py::ssize_t *min_shape = min.get_shape_raw();
+    const py::ssize_t *max_shape = max.get_shape_raw();
+    const py::ssize_t *dst_shape = dst.get_shape_raw();
+
+    bool shapes_equal(true);
+    size_t nelems(1);
+    for (int i = 0; i < nd; ++i) {
+        const auto &sh_i = dst_shape[i];
+        nelems *= static_cast<size_t>(sh_i);
+        shapes_equal = shapes_equal && (min_shape[i] == sh_i) &&
+                       (max_shape[i] == sh_i) && (src_shape[i] == sh_i);
+    }
+
+    if (!shapes_equal) {
+        throw py::value_error("Arrays are not of matching shapes.");
+    }
+
+    if (nelems == 0) {
+        return std::make_pair(sycl::event{}, sycl::event{});
+    }
+
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    auto const &same_logical_tensors =
+        dpctl::tensor::overlap::SameLogicalTensors();
+    if ((overlap(dst, src) && !same_logical_tensors(dst, src)) ||
+        (overlap(dst, min) && !same_logical_tensors(dst, min)) ||
+        (overlap(dst, max) && !same_logical_tensors(dst, max)))
+    {
+        throw py::value_error("Destination array overlaps with input.");
+    }
+
+    int min_typenum = min.get_typenum();
+    int max_typenum = max.get_typenum();
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    auto const &array_types = td_ns::usm_ndarray_types();
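+    // map NumPy typenums onto the canonical type ids used to index the
+    // dispatch vectors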
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int min_typeid = array_types.typenum_to_lookup_id(min_typenum);
+    int max_typeid = array_types.typenum_to_lookup_id(max_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    if (src_typeid != dst_typeid || src_typeid != min_typeid ||
+        src_typeid != max_typeid)
+    {
+        throw py::value_error("Input, min, max, and destination arrays must "
+                              "have the same data type");
+    }
+
+    // ensure that dst is sufficiently ample
+    auto dst_offsets = dst.get_minmax_offsets();
+    // destination must be ample enough to accommodate all elements
+    {
+        size_t range =
+            static_cast<size_t>(dst_offsets.second - dst_offsets.first);
+        if (range + 1 < static_cast<size_t>(nelems)) {
+            throw py::value_error(
+                "Memory addressed by the destination array can not "
+                "accommodate all the "
+                "array elements.");
+        }
+    }
+
+    char *src_data = src.get_data();
+    char *min_data = min.get_data();
+    char *max_data = max.get_data();
+    char *dst_data = dst.get_data();
+
+    bool is_min_c_contig = min.is_c_contiguous();
+    bool is_min_f_contig = min.is_f_contiguous();
+
+    bool is_max_c_contig = max.is_c_contiguous();
+    bool is_max_f_contig = max.is_f_contiguous();
+
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_src_f_contig = src.is_f_contiguous();
+
+    bool is_dst_c_contig = dst.is_c_contiguous();
+    bool is_dst_f_contig = dst.is_f_contiguous();
+
+    bool all_c_contig = (is_min_c_contig && is_max_c_contig &&
+                         is_src_c_contig && is_dst_c_contig);
+    bool all_f_contig = (is_min_f_contig && is_max_f_contig &&
+                         is_src_f_contig && is_dst_f_contig);
+
+    if (all_c_contig || all_f_contig) {
+        auto fn = clip_contig_dispatch_vector[src_typeid];
+
+        sycl::event clip_ev =
+            fn(exec_q, nelems, src_data, min_data, max_data, dst_data, depends);
+        sycl::event ht_ev =
+            keep_args_alive(exec_q, {src, min, max, dst}, {clip_ev});
+
+        return std::make_pair(ht_ev, clip_ev);
+    }
+
+    auto const &src_strides = src.get_strides_vector();
+    auto const &min_strides = min.get_strides_vector();
+    auto const &max_strides = max.get_strides_vector();
+    auto const &dst_strides = dst.get_strides_vector();
+
+    using shT = std::vector<py::ssize_t>;
+    shT simplified_shape;
+    shT simplified_src_strides;
+    shT simplified_min_strides;
+    shT simplified_max_strides;
+    shT simplified_dst_strides;
+    py::ssize_t src_offset(0);
+    py::ssize_t min_offset(0);
+    py::ssize_t max_offset(0);
+    py::ssize_t dst_offset(0);
+
+    dpctl::tensor::py_internal::simplify_iteration_space_4(
+        nd, src_shape, src_strides, min_strides, max_strides, dst_strides,
+        // outputs
+        simplified_shape, simplified_src_strides, simplified_min_strides,
+        simplified_max_strides, simplified_dst_strides, src_offset, min_offset,
+        max_offset, dst_offset);
+
+    auto fn = clip_strided_dispatch_vector[src_typeid];
+
+    std::vector<sycl::event> host_task_events;
+    host_task_events.reserve(2);
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    auto ptr_size_event_tuple = device_allocate_and_pack<py::ssize_t>(
+        exec_q, host_task_events,
+        // common shape and strides
+        simplified_shape, simplified_src_strides, simplified_min_strides,
+        simplified_max_strides, simplified_dst_strides);
+    py::ssize_t *packed_shape_strides = std::get<0>(ptr_size_event_tuple);
+    sycl::event copy_shape_strides_ev = std::get<2>(ptr_size_event_tuple);
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.insert(all_deps.end(), depends.begin(), depends.end());
+    all_deps.push_back(copy_shape_strides_ev);
+
+    assert(all_deps.size() == depends.size() + 1);
+
+    sycl::event clip_ev = fn(exec_q, nelems, nd, src_data, min_data, max_data,
+                             dst_data, packed_shape_strides, src_offset,
+                             min_offset, max_offset, dst_offset, all_deps);
+
+    // free packed temporaries
+    sycl::event temporaries_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(clip_ev);
+        const auto &ctx = exec_q.get_context();
+        cgh.host_task([packed_shape_strides, ctx]() {
+            sycl::free(packed_shape_strides, ctx);
+        });
+    });
+
+    host_task_events.push_back(temporaries_cleanup_ev);
+
+    sycl::event arg_cleanup_ev =
+        keep_args_alive(exec_q, {src, min, max, dst}, host_task_events);
+
+    return std::make_pair(arg_cleanup_ev, clip_ev);
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/clip.hpp b/dpctl/tensor/libtensor/source/clip.hpp
new file mode 100644
index 0000000000..d4b8af2cf5
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/clip.hpp
@@ -0,0 +1,52 @@
+//===-- clip.hpp - --*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares Python API for implementation functions of
+/// dpctl.tensor.clip
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <CL/sycl.hpp>
+#include <utility>
+#include <vector>
+
+#include "dpctl4pybind11.hpp"
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern std::pair<sycl::event, sycl::event>
+py_clip(const dpctl::tensor::usm_ndarray &src,
+        const dpctl::tensor::usm_ndarray &min,
+        const dpctl::tensor::usm_ndarray &max,
+        const dpctl::tensor::usm_ndarray &dst,
+        sycl::queue &exec_q,
+        const std::vector<sycl::event> &depends);
+
+extern void init_clip_dispatch_vectors(void);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp
index 282aecc95d..254856ec38 100644
--- a/dpctl/tensor/libtensor/source/tensor_py.cpp
+++ b/dpctl/tensor/libtensor/source/tensor_py.cpp
@@ -37,6 +37,7 @@
 #include "accumulators.hpp"
 #include "boolean_advanced_indexing.hpp"
 #include "boolean_reductions.hpp"
+#include "clip.hpp"
 #include "copy_and_cast_usm_to_usm.hpp"
 #include "copy_for_reshape.hpp"
 #include "copy_for_roll.hpp"
@@ -116,6 +117,9 @@ using dpctl::tensor::py_internal::usm_ndarray_triul;
 
 using dpctl::tensor::py_internal::py_where;
 
+/* =========================== Clip ============================== */
+using dpctl::tensor::py_internal::py_clip;
+
 // populate dispatch tables
 void init_dispatch_tables(void)
 {
@@ -148,6 +152,8 @@ void init_dispatch_vectors(void)
     populate_cumsum_1d_dispatch_vectors();
     init_repeat_dispatch_vectors();
 
+    init_clip_dispatch_vectors();
+
     return;
 }
 
@@ -441,6 +447,14 @@ PYBIND11_MODULE(_tensor_impl, m)
           py::arg("reps"), py::arg("axis"), py::arg("sycl_queue"),
           py::arg("depends") = py::list());
 
+    m.def("_clip", &py_clip,
+          "Clamps elements of array `x` to the range "
+          "[`min`, `max`] and writes the 
result to the " + "array `dst` for each element of `x`, `min`, and `max`." + "Returns a tuple of events: (hev, ev)", + py::arg("src"), py::arg("min"), py::arg("max"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + dpctl::tensor::py_internal::init_elementwise_functions(m); dpctl::tensor::py_internal::init_boolean_reduction_functions(m); dpctl::tensor::py_internal::init_reduction_functions(m); diff --git a/dpctl/tests/test_tensor_clip.py b/dpctl/tests/test_tensor_clip.py new file mode 100644 index 0000000000..7050b17e7c --- /dev/null +++ b/dpctl/tests/test_tensor_clip.py @@ -0,0 +1,627 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest +from helper import get_queue_or_skip, skip_if_dtype_not_supported +from numpy.testing import assert_raises_regex + +import dpctl +import dpctl.tensor as dpt +from dpctl.tensor._type_utils import _can_cast +from dpctl.utils import ExecutionPlacementError + +_all_dtypes = [ + "?", + "u1", + "i1", + "u2", + "i2", + "u4", + "i4", + "u8", + "i8", + "e", + "f", + "d", + "F", + "D", +] + +_usm_types = ["device", "shared", "host"] + + +@pytest.mark.parametrize("dt1", _all_dtypes) +@pytest.mark.parametrize("dt2", _all_dtypes) +def test_clip_dtypes(dt1, dt2): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt1, q) + skip_if_dtype_not_supported(dt2, q) + + sz = 127 + ar1 = dpt.ones(sz, dtype=dt1, sycl_queue=q) + ar2 = dpt.ones_like(ar1, dtype=dt1, sycl_queue=q) + ar3 = dpt.ones_like(ar1, dtype=dt2, sycl_queue=q) + + dev = q.sycl_device + _fp16 = dev.has_aspect_fp16 + _fp64 = dev.has_aspect_fp64 + # also covers cases where dt1 == dt2 + if _can_cast(ar3.dtype, ar1.dtype, _fp16, _fp64): + r = dpt.clip(ar1, ar2, ar3) + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == ar1.dtype + assert r.shape == ar1.shape + assert dpt.all(r == ar1) + assert r.sycl_queue == ar1.sycl_queue + + r = dpt.clip(ar1, min=ar3, max=None) + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == ar1.dtype + assert r.shape == ar1.shape + assert dpt.all(r == ar1) + assert r.sycl_queue == ar1.sycl_queue + + r = dpt.clip(ar1, min=None, max=ar3) + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == ar1.dtype + assert r.shape == ar1.shape + assert dpt.all(r == ar1) + assert r.sycl_queue == ar1.sycl_queue + else: + with pytest.raises(ValueError): + dpt.clip(ar1, ar2, ar3) + with pytest.raises(ValueError): + dpt.clip(ar1, min=ar3, max=None) + with pytest.raises(ValueError): + dpt.clip(ar1, min=None, max=ar3) + + +def test_clip_empty(): + get_queue_or_skip() + + x = dpt.empty((2, 0, 3), dtype="i4") + a_min = dpt.ones((2, 0, 3), dtype="i4") + a_max = dpt.ones((2, 0, 3), dtype="i4") + + r = dpt.clip(x, a_min, a_max) + assert r.size == 0 + assert r.shape == x.shape + + +def test_clip_python_scalars(): + get_queue_or_skip() + + arrs = [ + dpt.ones(1, dtype="?"), + dpt.ones(1, dtype="i4"), + dpt.ones(1, dtype="f4"), + dpt.ones(1, dtype="c8"), + ] + + 
py_zeros = [ + False, + 0, + 0.0, + complex(0, 0), + ] + + py_ones = [ + True, + 1, + 1.0, + complex(1, 0), + ] + + for zero, one, arr in zip(py_zeros, py_ones, arrs): + r = dpt.clip(arr, zero, one) + assert isinstance(r, dpt.usm_ndarray) + r = dpt.clip(arr, min=zero) + assert isinstance(r, dpt.usm_ndarray) + + +def test_clip_in_place(): + get_queue_or_skip() + + x = dpt.arange(10, dtype="i4") + a_min = dpt.arange(1, 11, dtype="i4") + a_max = dpt.arange(2, 12, dtype="i4") + dpt.clip(x, a_min, a_max, out=x) + assert dpt.all(x == a_min) + + x = dpt.arange(10, dtype="i4") + dpt.clip(x, min=a_min, max=None, out=x) + assert dpt.all(x == a_min) + + x = dpt.arange(10, dtype="i4") + dpt.clip(x, a_min, a_max, out=a_max) + assert dpt.all(a_max == a_min) + + a_min = dpt.arange(1, 11, dtype="i4") + dpt.clip(x, min=a_min, max=None, out=a_min[::-1]) + assert dpt.all((x + 1)[::-1] == a_min) + + +def test_clip_special_cases(): + get_queue_or_skip() + + x = dpt.arange(10, dtype="f4") + r = dpt.clip(x, -dpt.inf, dpt.inf) + assert dpt.all(r == x) + r = dpt.clip(x, dpt.nan, dpt.inf) + assert dpt.all(dpt.isnan(r)) + r = dpt.clip(x, -dpt.inf, dpt.nan) + assert dpt.all(dpt.isnan(r)) + + +def test_clip_out_need_temporary(): + get_queue_or_skip() + + x = dpt.ones(10, dtype="i4") + a_min = dpt.asarray(2, dtype="i4") + a_max = dpt.asarray(3, dtype="i4") + dpt.clip(x[:6], 2, 3, out=x[-6:]) + assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2) + + x = dpt.ones(10, dtype="i4") + a_min = dpt.asarray(2, dtype="i4") + a_max = dpt.asarray(3, dtype="i2") + dpt.clip(x[:6], 2, 3, out=x[-6:]) + assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2) + + x = dpt.ones(10, dtype="i4") + a_min = dpt.asarray(2, dtype="i2") + a_max = dpt.asarray(3, dtype="i4") + dpt.clip(x[:6], 2, 3, out=x[-6:]) + assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2) + + x = dpt.ones(10, dtype="i4") + a_min = dpt.asarray(2, dtype="i2") + a_max = dpt.asarray(3, dtype="i1") + dpt.clip(x[:6], 2, 3, out=x[-6:]) + assert dpt.all(x[:-6] == 1) and dpt.all(x[-6:] == 2) + + x = dpt.full(6, 3, dtype="i4") + a_min = dpt.full(10, 2, dtype="i4") + a_max = dpt.asarray(4, dtype="i4") + dpt.clip(x, min=a_min[:6], max=a_max, out=a_min[-6:]) + assert dpt.all(a_min[:-6] == 2) and dpt.all(a_min[-6:] == 3) + + x = dpt.full(6, 3, dtype="i4") + a_min = dpt.full(10, 2, dtype="i4") + a_max = dpt.asarray(4, dtype="i2") + dpt.clip(x, min=a_min[:6], max=a_max, out=a_min[-6:]) + assert dpt.all(a_min[:-6] == 2) and dpt.all(a_min[-6:] == 3) + + +def test_clip_out_need_temporary_none(): + get_queue_or_skip() + + x = dpt.full(6, 3, dtype="i4") + # with min/max == None + a_min = dpt.full(10, 2, dtype="i4") + dpt.clip(x, min=a_min[:6], max=None, out=a_min[-6:]) + assert dpt.all(a_min[:-6] == 2) and dpt.all(a_min[-6:] == 3) + + +def test_clip_arg_validation(): + get_queue_or_skip() + + check = dict() + x1 = dpt.empty((1,), dtype="i4") + x2 = dpt.empty((1,), dtype="i4") + + with pytest.raises(TypeError): + dpt.clip(check, x1, x2) + + +@pytest.mark.parametrize( + "dt1,dt2", [("i4", "i4"), ("i4", "i2"), ("i2", "i4"), ("i1", "i2")] +) +def test_clip_order(dt1, dt2): + get_queue_or_skip() + + test_shape = ( + 20, + 20, + ) + test_shape2 = tuple(2 * dim for dim in test_shape) + n = test_shape[-1] + + ar1 = dpt.ones(test_shape, dtype="i4", order="C") + ar2 = dpt.ones(test_shape, dtype=dt1, order="C") + ar3 = dpt.ones(test_shape, dtype=dt2, order="C") + r1 = dpt.clip(ar1, ar2, ar3, order="C") + assert r1.flags.c_contiguous + r2 = dpt.clip(ar1, ar2, ar3, order="F") + assert r2.flags.f_contiguous 
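+    # inputs above are C-contiguous; order="A" and order="K" are therefore
+    # expected to produce C-contiguous results as well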
+ r3 = dpt.clip(ar1, ar2, ar3, order="A") + assert r3.flags.c_contiguous + r4 = dpt.clip(ar1, ar2, ar3, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones(test_shape, dtype="i4", order="F") + ar2 = dpt.ones(test_shape, dtype=dt1, order="F") + ar3 = dpt.ones(test_shape, dtype=dt2, order="F") + r1 = dpt.clip(ar1, ar2, ar3, order="C") + assert r1.flags.c_contiguous + r2 = dpt.clip(ar1, ar2, ar3, order="F") + assert r2.flags.f_contiguous + r3 = dpt.clip(ar1, ar2, ar3, order="A") + assert r3.flags.f_contiguous + r4 = dpt.clip(ar1, ar2, ar3, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2] + ar3 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2] + r4 = dpt.clip(ar1, ar2, ar3, order="K") + assert r4.strides == (n, -1) + r5 = dpt.clip(ar1, ar2, ar3, order="C") + assert r5.strides == (n, 1) + + ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones(test_shape2, dtype=dt1, order="C")[:20, ::-2].mT + ar3 = dpt.ones(test_shape2, dtype=dt2, order="C")[:20, ::-2].mT + r4 = dpt.clip(ar1, ar2, ar3, order="K") + assert r4.strides == (-1, n) + r5 = dpt.clip(ar1, ar2, ar3, order="C") + assert r5.strides == (n, 1) + + +@pytest.mark.parametrize("dt", ["i4", "i2"]) +def test_clip_none_order(dt): + get_queue_or_skip() + + test_shape = ( + 20, + 20, + ) + test_shape2 = tuple(2 * dim for dim in test_shape) + n = test_shape[-1] + + ar1 = dpt.ones(test_shape, dtype="i4", order="C") + ar2 = dpt.ones(test_shape, dtype=dt, order="C") + + r1 = dpt.clip(ar1, min=None, max=ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.clip(ar1, min=None, max=ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.clip(ar1, min=None, max=ar2, order="A") + assert r3.flags.c_contiguous + r4 = dpt.clip(ar1, min=None, max=ar2, order="K") + assert r4.flags.c_contiguous + + ar1 = dpt.ones(test_shape, dtype="i4", order="F") + ar2 = dpt.ones(test_shape, dtype=dt, order="F") + + r1 = dpt.clip(ar1, min=None, max=ar2, order="C") + assert r1.flags.c_contiguous + r2 = dpt.clip(ar1, min=None, max=ar2, order="F") + assert r2.flags.f_contiguous + r3 = dpt.clip(ar1, min=None, max=ar2, order="A") + assert r3.flags.f_contiguous + r4 = dpt.clip(ar1, min=None, max=ar2, order="K") + assert r4.flags.f_contiguous + + ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2] + ar2 = dpt.ones(test_shape2, dtype=dt, order="C")[:20, ::-2] + + r4 = dpt.clip(ar1, min=None, max=ar2, order="K") + assert r4.strides == (n, -1) + r5 = dpt.clip(ar1, min=None, max=ar2, order="C") + assert r5.strides == (n, 1) + + ar1 = dpt.ones(test_shape2, dtype="i4", order="C")[:20, ::-2].mT + ar2 = dpt.ones(test_shape2, dtype=dt, order="C")[:20, ::-2].mT + + r4 = dpt.clip(ar1, min=None, max=ar2, order="K") + assert r4.strides == (-1, n) + r5 = dpt.clip(ar1, min=None, max=ar2, order="C") + assert r5.strides == (n, 1) + + +@pytest.mark.parametrize("usm_type1", _usm_types) +@pytest.mark.parametrize("usm_type2", _usm_types) +@pytest.mark.parametrize("usm_type3", _usm_types) +def test_clip_usm_type_matrix(usm_type1, usm_type2, usm_type3): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=usm_type1) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=usm_type2) + ar3 = dpt.ones_like(ar1, dtype="i4", usm_type=usm_type3) + + r = dpt.clip(ar1, ar2, ar3) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpctl.utils.get_coerced_usm_type( + (usm_type1, usm_type2, usm_type3) + ) + assert 
r.usm_type == expected_usm_type + + +@pytest.mark.parametrize("usm_type1", _usm_types) +@pytest.mark.parametrize("usm_type2", _usm_types) +def test_clip_usm_type_matrix_none_arg(usm_type1, usm_type2): + get_queue_or_skip() + + sz = 128 + ar1 = dpt.ones(sz, dtype="i4", usm_type=usm_type1) + ar2 = dpt.ones_like(ar1, dtype="i4", usm_type=usm_type2) + + r = dpt.clip(ar1, min=ar2, max=None) + assert isinstance(r, dpt.usm_ndarray) + expected_usm_type = dpctl.utils.get_coerced_usm_type((usm_type1, usm_type2)) + assert r.usm_type == expected_usm_type + + +def test_clip_dtype_error(): + get_queue_or_skip() + + ar1 = dpt.ones(1, dtype="i4") + ar2 = dpt.ones(1, dtype="i4") + ar3 = dpt.ones(1, dtype="i4") + ar4 = dpt.empty_like(ar1, dtype="f4") + + assert_raises_regex( + ValueError, + "Output array of type.*is needed", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + assert_raises_regex( + ValueError, + "Output array of type.*is needed", + dpt.clip, + ar1, + ar2, + None, + ar4, + ) + + +def test_clip_errors(): + get_queue_or_skip() + try: + gpu_queue = dpctl.SyclQueue("gpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('gpu') failed, skipping") + try: + cpu_queue = dpctl.SyclQueue("cpu") + except dpctl.SyclQueueCreationError: + pytest.skip("SyclQueue('cpu') failed, skipping") + + ar1 = dpt.ones(2, dtype="float32", sycl_queue=gpu_queue) + ar2 = dpt.ones_like(ar1, sycl_queue=gpu_queue) + ar3 = dpt.ones_like(ar1, sycl_queue=gpu_queue) + ar4 = dpt.empty_like(ar1, sycl_queue=cpu_queue) + assert_raises_regex( + ExecutionPlacementError, + "Input and output allocation queues are not compatible", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + + assert_raises_regex( + ExecutionPlacementError, + "Input and output allocation queues are not compatible", + dpt.clip, + ar1, + None, + ar3, + ar4, + ) + + assert_raises_regex( + ExecutionPlacementError, + "Execution placement can not be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + ar4, + ar2, + ar3, + ) + + assert_raises_regex( + ExecutionPlacementError, + "Execution placement can not be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + ar4, + 1, + ar3, + ) + + assert_raises_regex( + ExecutionPlacementError, + "Execution placement can not be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + 1, + ar4, + ar3, + ) + + assert_raises_regex( + ExecutionPlacementError, + "Execution placement can not be unambiguously inferred from input " + "arguments.", + dpt.clip, + ar1, + ar4, + None, + ar2, + ) + + ar1 = dpt.ones(2, dtype="float32") + ar2 = dpt.ones_like(ar1, dtype="float32") + ar3 = dpt.ones_like(ar1, dtype="float32") + ar4 = dpt.empty(3, dtype="float32") + assert_raises_regex( + ValueError, + "The shape of input and output arrays are inconsistent", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + + assert_raises_regex( + ValueError, + "The shape of input and output arrays are inconsistent", + dpt.clip, + ar1, + ar2, + None, + ar4, + ) + + ar1 = np.ones(2, dtype="f4") + ar2 = dpt.ones(2, dtype="f4") + ar3 = dpt.ones(2, dtype="f4") + assert_raises_regex( + TypeError, + "Expected `x` to be of dpctl.tensor.usm_ndarray type*", + dpt.clip, + ar1, + ar2, + ar3, + ) + + ar1 = dpt.ones(2, dtype="i4") + ar2 = dpt.ones_like(ar1, dtype="i4") + ar3 = dpt.ones_like(ar1, dtype="i4") + ar4 = np.empty_like(ar1) + assert_raises_regex( + TypeError, + "output array must be of usm_ndarray type", + dpt.clip, + ar1, + ar2, + ar3, + ar4, + ) + + assert_raises_regex( + TypeError, + "output array must be of 
usm_ndarray type", + dpt.clip, + ar1, + ar2, + None, + ar4, + ) + + +def test_clip_out_type_check(): + get_queue_or_skip() + + x1 = dpt.ones(10) + x2 = dpt.ones(10) + x3 = dpt.ones(10) + + out = range(10) + + with pytest.raises(TypeError): + dpt.clip(x1, x2, x3, out=out) + + +@pytest.mark.parametrize("dt", ["i4", "f4", "c8"]) +def test_clip_basic(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + sz = 1026 + x = dpt.arange(sz, dtype=dt, sycl_queue=q) + r = dpt.clip(x, min=100, max=500) + expected = dpt.arange(sz, dtype=dt, sycl_queue=q) + expected[:100] = 100 + expected[500:] = 500 + assert dpt.all(expected == r) + + x = dpt.zeros(sz, dtype=dt, sycl_queue=q) + a_max = dpt.full(sz, -1, dtype=dt, sycl_queue=q) + a_max[::2] = -2 + r = dpt.clip(x, min=-3, max=a_max) + assert dpt.all(a_max == r) + + +@pytest.mark.parametrize("dt", ["i4", "f4", "c8"]) +def test_clip_strided(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + sz = 2 * 1026 + x = dpt.arange(sz, dtype=dt, sycl_queue=q)[::-2] + r = dpt.clip(x, min=100, max=500) + expected = dpt.arange(sz, dtype=dt, sycl_queue=q) + expected[:100] = 100 + expected[500:] = 500 + expected = expected[::-2] + assert dpt.all(expected == r) + + x = dpt.zeros(sz, dtype=dt, sycl_queue=q)[::-2] + a_max = dpt.full(sz, -1, dtype=dt, sycl_queue=q) + a_max[::2] = -2 + a_max = a_max[::-2] + r = dpt.clip(x, min=-3, max=a_max) + assert dpt.all(a_max == r) + + +def test_clip_max_less_than_min(): + get_queue_or_skip() + + x = dpt.ones(10, dtype="i4") + res = dpt.clip(x, 5, 0) + assert dpt.all(res == 0) + + +def test_clip_minmax_weak_types(): + get_queue_or_skip() + + x = dpt.zeros(10, dtype=dpt.bool) + min_list = [False, 0, 0.0, 0.0 + 0.0j] + max_list = [True, 1, 1.0, 1.0 + 0.0j] + for min_v, max_v in zip(min_list, max_list): + if isinstance(min_v, bool) and isinstance(max_v, bool): + y = dpt.clip(x, min_v, max_v) + assert isinstance(y, dpt.usm_ndarray) + else: + with pytest.raises(ValueError): + dpt.clip(x, min_v, max_v) + + +def test_clip_max_weak_types(): + get_queue_or_skip() + + x = dpt.zeros(10, dtype="i4") + m = dpt.ones(10, dtype="i4") + + with pytest.raises(ValueError): + dpt.clip(x, m, 2.5) + + with pytest.raises(ValueError): + dpt.clip(x, 2.5, m) From 03fd73794070b80f7a41a94045704e1ef8721188 Mon Sep 17 00:00:00 2001 From: ndgrigorian <46709016+ndgrigorian@users.noreply.github.com> Date: Wed, 25 Oct 2023 18:48:14 -0700 Subject: [PATCH 54/83] Implements ``dpctl.tensor.logsumexp`` and ``dpctl.tensor.reduce_hypot`` (#1446) * Implements logsumexp and reduce_hypot * Implements dedicated kernels for temp reductions over axes 1 and 0 in contiguous arrays * logsumexp and reduce_hypot no longer use atomics This change was made to improve the accuracy of these functions * Adds tests for reduce_hypot and logsumexp * Arithmetic reductions no longer use atomics for inexact types This change is intended to improve the numerical stability of sum and prod * Removed support of atomic reduction for max and min * Adds new tests for reductions * Split reductions into multiple source files * Remove unneccessary imports of reduction init functions * Added functions for querying reduction atomic support per type per function * Corrected ``min`` contig variant typo These variants were using ``sycl::maximum`` rather than ``sycl::minimum`` * Removes _tree_reduction_over_axis Use lambdas to ignore atomic-specific arguments to hypot and logsumexp dtype_supported functions * Always use atomic implementation for min/max if available For 
add/multiplies reductions, use tree reduction for FP types, real and complex, to get better round-off accumulation properties. * ``logaddexp`` implementation moved to math_utils Reduces code repetition between logsumexp and logaddexp --------- Co-authored-by: Oleksandr Pavlyk --- dpctl/tensor/CMakeLists.txt | 16 +- dpctl/tensor/__init__.py | 13 +- dpctl/tensor/_reduction.py | 159 +- .../elementwise_functions/logaddexp.hpp | 27 +- .../libtensor/include/kernels/reductions.hpp | 4556 ++++++++++++----- .../libtensor/include/utils/math_utils.hpp | 20 + .../libtensor/include/utils/sycl_utils.hpp | 40 + .../libtensor/source/reduction_over_axis.cpp | 514 -- .../libtensor/source/reduction_over_axis.hpp | 689 --- .../libtensor/source/reductions/argmax.cpp | 119 + .../libtensor/source/reductions/argmax.hpp | 41 + .../libtensor/source/reductions/argmin.cpp | 119 + .../libtensor/source/reductions/argmin.hpp | 41 + .../libtensor/source/reductions/logsumexp.cpp | 136 + .../libtensor/source/reductions/logsumexp.hpp | 41 + .../libtensor/source/reductions/max.cpp | 171 + .../libtensor/source/reductions/max.hpp | 41 + .../libtensor/source/reductions/min.cpp | 173 + .../libtensor/source/reductions/min.hpp | 41 + .../libtensor/source/reductions/prod.cpp | 187 + .../libtensor/source/reductions/prod.hpp | 41 + .../source/reductions/reduce_hypot.cpp | 132 + .../source/reductions/reduce_hypot.hpp | 41 + .../reductions/reduction_atomic_support.hpp | 143 + .../source/reductions/reduction_common.cpp | 60 + .../source/reductions/reduction_common.hpp | 41 + .../source/reductions/reduction_over_axis.hpp | 1095 ++++ .../libtensor/source/reductions/sum.cpp | 187 + .../libtensor/source/reductions/sum.hpp | 41 + dpctl/tensor/libtensor/source/tensor_py.cpp | 2 +- dpctl/tests/test_tensor_sum.py | 15 + dpctl/tests/test_usm_ndarray_reductions.py | 195 + 32 files changed, 6735 insertions(+), 2402 deletions(-) delete mode 100644 dpctl/tensor/libtensor/source/reduction_over_axis.cpp delete mode 100644 dpctl/tensor/libtensor/source/reduction_over_axis.hpp create mode 100644 dpctl/tensor/libtensor/source/reductions/argmax.cpp create mode 100644 dpctl/tensor/libtensor/source/reductions/argmax.hpp create mode 100644 dpctl/tensor/libtensor/source/reductions/argmin.cpp create mode 100644 dpctl/tensor/libtensor/source/reductions/argmin.hpp create mode 100644 dpctl/tensor/libtensor/source/reductions/logsumexp.cpp create mode 100644 dpctl/tensor/libtensor/source/reductions/logsumexp.hpp create mode 100644 dpctl/tensor/libtensor/source/reductions/max.cpp create mode 100644 dpctl/tensor/libtensor/source/reductions/max.hpp create mode 100644 dpctl/tensor/libtensor/source/reductions/min.cpp create mode 100644 dpctl/tensor/libtensor/source/reductions/min.hpp create mode 100644 dpctl/tensor/libtensor/source/reductions/prod.cpp create mode 100644 dpctl/tensor/libtensor/source/reductions/prod.hpp create mode 100644 dpctl/tensor/libtensor/source/reductions/reduce_hypot.cpp create mode 100644 dpctl/tensor/libtensor/source/reductions/reduce_hypot.hpp create mode 100644 dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp create mode 100644 dpctl/tensor/libtensor/source/reductions/reduction_common.cpp create mode 100644 dpctl/tensor/libtensor/source/reductions/reduction_common.hpp create mode 100644 dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp create mode 100644 dpctl/tensor/libtensor/source/reductions/sum.cpp create mode 100644 dpctl/tensor/libtensor/source/reductions/sum.hpp diff --git 
a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 5247b4953b..9c02a325bc 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -102,6 +102,17 @@ set(_elementwise_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/true_divide.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/trunc.cpp ) +set(_reduction_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduction_common.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmax.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/argmin.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/logsumexp.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/max.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/min.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/prod.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduce_hypot.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/sum.cpp +) set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_py.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp @@ -120,11 +131,11 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_reductions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reduction_over_axis.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ) list(APPEND _tensor_impl_sources ${_elementwise_sources} + ${_reduction_sources} ) set(python_module_name _tensor_impl) @@ -138,12 +149,13 @@ endif() set(_no_fast_math_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reduction_over_axis.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ) list(APPEND _no_fast_math_sources ${_elementwise_sources} + ${_reduction_sources} ) + foreach(_src_fn ${_no_fast_math_sources}) get_source_file_property(_cmpl_options_prop ${_src_fn} COMPILE_OPTIONS) set(_combined_options_prop ${_cmpl_options_prop} "${_clang_prefix}-fno-fast-math") diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index 209a6d4e56..5eee3e9ab9 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -165,7 +165,16 @@ tanh, trunc, ) -from ._reduction import argmax, argmin, max, min, prod, sum +from ._reduction import ( + argmax, + argmin, + logsumexp, + max, + min, + prod, + reduce_hypot, + sum, +) from ._testing import allclose __all__ = [ @@ -324,4 +333,6 @@ "copysign", "rsqrt", "clip", + "logsumexp", + "reduce_hypot", ] diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index aac1c84677..0edc9ac12b 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -52,6 +52,28 @@ def _default_reduction_dtype(inp_dt, q): return res_dt +def _default_reduction_dtype_fp_types(inp_dt, q): + """Gives default output data type for given input data + type `inp_dt` when reduction is performed on queue `q` + and the reduction supports only floating-point data types + """ + inp_kind = inp_dt.kind + if inp_kind in "biu": + res_dt = dpt.dtype(ti.default_device_fp_type(q)) + can_cast_v = dpt.can_cast(inp_dt, res_dt) + if not can_cast_v: + _fp64 = q.sycl_device.has_aspect_fp64 + res_dt = dpt.float64 if _fp64 else dpt.float32 + elif inp_kind in "f": + res_dt = dpt.dtype(ti.default_device_fp_type(q)) 
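+        # keep the input dtype when it is wider than the device default
+        # (e.g. an f8 input on a device whose default fp type is f4)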
+ if res_dt.itemsize < inp_dt.itemsize: + res_dt = inp_dt + elif inp_kind in "c": + raise TypeError("reduction not defined for complex types") + + return res_dt + + def _reduction_over_axis( x, axis, @@ -91,12 +113,15 @@ def _reduction_over_axis( res_shape = res_shape + (1,) * red_nd inv_perm = sorted(range(nd), key=lambda d: perm[d]) res_shape = tuple(res_shape[i] for i in inv_perm) - return dpt.full( - res_shape, - _identity, - dtype=res_dt, - usm_type=res_usm_type, - sycl_queue=q, + return dpt.astype( + dpt.full( + res_shape, + _identity, + dtype=_default_reduction_type_fn(inp_dt, q), + usm_type=res_usm_type, + sycl_queue=q, + ), + res_dt, ) if red_nd == 0: return dpt.astype(x, res_dt, copy=False) @@ -116,7 +141,7 @@ def _reduction_over_axis( "Automatically determined reduction data type does not " "have direct implementation" ) - tmp_dt = _default_reduction_dtype(inp_dt, q) + tmp_dt = _default_reduction_type_fn(inp_dt, q) tmp = dpt.empty( res_shape, dtype=tmp_dt, usm_type=res_usm_type, sycl_queue=q ) @@ -161,13 +186,13 @@ def sum(x, axis=None, dtype=None, keepdims=False): the returned array will have the default real-valued floating-point data type for the device where input array `x` is allocated. - * If x` has signed integral data type, the returned array + * If `x` has signed integral data type, the returned array will have the default signed integral type for the device where input array `x` is allocated. * If `x` has unsigned integral data type, the returned array will have the default unsigned integral type for the device where input array `x` is allocated. - * If `x` has a complex-valued floating-point data typee, + * If `x` has a complex-valued floating-point data type, the returned array will have the default complex-valued floating-pointer data type for the device where input array `x` is allocated. @@ -222,13 +247,13 @@ def prod(x, axis=None, dtype=None, keepdims=False): the returned array will have the default real-valued floating-point data type for the device where input array `x` is allocated. - * If x` has signed integral data type, the returned array + * If `x` has signed integral data type, the returned array will have the default signed integral type for the device where input array `x` is allocated. * If `x` has unsigned integral data type, the returned array will have the default unsigned integral type for the device where input array `x` is allocated. - * If `x` has a complex-valued floating-point data typee, + * If `x` has a complex-valued floating-point data type, the returned array will have the default complex-valued floating-pointer data type for the device where input array `x` is allocated. @@ -263,6 +288,118 @@ def prod(x, axis=None, dtype=None, keepdims=False): ) +def logsumexp(x, axis=None, dtype=None, keepdims=False): + """logsumexp(x, axis=None, dtype=None, keepdims=False) + + Calculates the logarithm of the sum of exponentials of elements in the + input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which values must be computed. If a tuple + of unique integers, values are computed over multiple axes. + If `None`, the result is computed over the entire array. + Default: `None`. + dtype (Optional[dtype]): + data type of the returned array. If `None`, the default data + type is inferred from the "kind" of the input array data type. 
+ * If `x` has a real-valued floating-point data type, + the returned array will have the default real-valued + floating-point data type for the device where input + array `x` is allocated. + * If `x` has a boolean or integral data type, the returned array + will have the default floating point data type for the device + where input array `x` is allocated. + * If `x` has a complex-valued floating-point data type, + an error is raised. + If the data type (either specified or resolved) differs from the + data type of `x`, the input array elements are cast to the + specified data type before computing the result. Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the results. If the result was computed over + the entire array, a zero-dimensional array is returned. The returned + array has the data type as described in the `dtype` parameter + description above. + """ + return _reduction_over_axis( + x, + axis, + dtype, + keepdims, + ti._logsumexp_over_axis, + lambda inp_dt, res_dt, *_: ti._logsumexp_over_axis_dtype_supported( + inp_dt, res_dt + ), + _default_reduction_dtype_fp_types, + _identity=-dpt.inf, + ) + + +def reduce_hypot(x, axis=None, dtype=None, keepdims=False): + """reduce_hypot(x, axis=None, dtype=None, keepdims=False) + + Calculates the square root of the sum of squares of elements in the input + array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which values must be computed. If a tuple + of unique integers, values are computed over multiple axes. + If `None`, the result is computed over the entire array. + Default: `None`. + dtype (Optional[dtype]): + data type of the returned array. If `None`, the default data + type is inferred from the "kind" of the input array data type. + * If `x` has a real-valued floating-point data type, + the returned array will have the default real-valued + floating-point data type for the device where input + array `x` is allocated. + * If `x` has a boolean or integral data type, the returned array + will have the default floating point data type for the device + where input array `x` is allocated. + * If `x` has a complex-valued floating-point data type, + an error is raised. + If the data type (either specified or resolved) differs from the + data type of `x`, the input array elements are cast to the + specified data type before computing the result. Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the results. If the result was computed over + the entire array, a zero-dimensional array is returned. The returned + array has the data type as described in the `dtype` parameter + description above. 
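+
+    Example (an illustrative sketch; the dtype below is an assumption,
+    any supported real dtype behaves the same way):
+
+        import dpctl.tensor as dpt
+
+        x = dpt.ones(4, dtype="f4")
+        y = dpt.reduce_hypot(x)  # sqrt(1 + 1 + 1 + 1) -> 2.0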
+ """ + return _reduction_over_axis( + x, + axis, + dtype, + keepdims, + ti._hypot_over_axis, + lambda inp_dt, res_dt, *_: ti._hypot_over_axis_dtype_supported( + inp_dt, res_dt + ), + _default_reduction_dtype_fp_types, + _identity=0, + ) + + def _comparison_over_axis(x, axis, keepdims, _reduction_fn): if not isinstance(x, dpt.usm_ndarray): raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp index 90b7997a37..6a187da6f4 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp @@ -31,6 +31,7 @@ #include #include +#include "utils/math_utils.hpp" #include "utils/offset_utils.hpp" #include "utils/type_dispatch.hpp" #include "utils/type_utils.hpp" @@ -61,7 +62,8 @@ template struct LogAddExpFunctor resT operator()(const argT1 &in1, const argT2 &in2) const { - return impl(in1, in2); + using dpctl::tensor::math_utils::logaddexp; + return logaddexp(in1, in2); } template @@ -79,7 +81,8 @@ template struct LogAddExpFunctor impl_finite(-std::abs(diff[i])); } else { - res[i] = impl(in1[i], in2[i]); + using dpctl::tensor::math_utils::logaddexp; + res[i] = logaddexp(in1[i], in2[i]); } } @@ -87,26 +90,6 @@ template struct LogAddExpFunctor } private: - template T impl(T const &in1, T const &in2) const - { - if (in1 == in2) { // handle signed infinities - const T log2 = std::log(T(2)); - return in1 + log2; - } - else { - const T tmp = in1 - in2; - if (tmp > 0) { - return in1 + std::log1p(std::exp(-tmp)); - } - else if (tmp <= 0) { - return in2 + std::log1p(std::exp(tmp)); - } - else { - return std::numeric_limits::quiet_NaN(); - } - } - } - template T impl_finite(T const &in) const { return (in > 0) ? 
(in + std::log1p(std::exp(-in))) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 7cb97cd4f9..b9e2918c8c 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -685,7 +685,6 @@ sycl::event reduction_axis0_over_group_with_atomics_contig_impl( resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; constexpr resTy identity_val = su_ns::Identity::value; - ; const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); @@ -944,8 +943,103 @@ struct CustomReductionOverGroupNoAtomicFunctor } }; +typedef sycl::event (*reduction_strided_impl_fn_ptr)( + sycl::queue &, + size_t, + size_t, + const char *, + char *, + int, + const py::ssize_t *, + py::ssize_t, + py::ssize_t, + int, + const py::ssize_t *, + py::ssize_t, + const std::vector &); + +template +class reduction_over_group_temps_strided_krn; + +template +class custom_reduction_over_group_temps_strided_krn; + +template +class single_reduction_axis0_temps_contig_krn; + +template +class first_reduction_axis0_temps_contig_krn; + +template +class middle_reduction_axis0_temps_contig_krn; + +template +class final_reduction_axis0_temps_contig_krn; + +template +class single_custom_reduction_axis0_temps_contig_krn; + +template +class first_custom_reduction_axis0_temps_contig_krn; + +template +class middle_custom_reduction_axis0_temps_contig_krn; + +template +class final_custom_reduction_axis0_temps_contig_krn; + +template +class single_reduction_axis1_temps_contig_krn; + +template +class first_reduction_axis1_temps_contig_krn; + template -class reduction_over_group_temps_krn; +class middle_reduction_axis1_temps_contig_krn; + +template +class final_reduction_axis1_temps_contig_krn; + +template +class single_custom_reduction_axis1_temps_contig_krn; + +template +class first_custom_reduction_axis1_temps_contig_krn; template -class custom_reduction_over_group_temps_krn; +class middle_custom_reduction_axis1_temps_contig_krn; + +template +class final_custom_reduction_axis1_temps_contig_krn; template sycl::event reduction_over_group_temps_strided_impl( @@ -1020,7 +1122,7 @@ sycl::event reduction_over_group_temps_strided_impl( if constexpr (can_use_reduce_over_group::value) { - using KernelName = class reduction_over_group_temps_krn< + using KernelName = class reduction_over_group_temps_strided_krn< argTy, resTy, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT>; @@ -1036,9 +1138,10 @@ sycl::event reduction_over_group_temps_strided_impl( else { using SlmT = sycl::local_accessor; SlmT local_memory = SlmT(localRange, cgh); - using KernelName = class custom_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT, SlmT>; + using KernelName = + class custom_reduction_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; cgh.parallel_for( sycl::nd_range<1>(globalRange, localRange), @@ -1107,7 +1210,7 @@ sycl::event reduction_over_group_temps_strided_impl( if constexpr (can_use_reduce_over_group::value) { - using KernelName = class reduction_over_group_temps_krn< + using KernelName = class reduction_over_group_temps_strided_krn< argTy, resTy, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT>; cgh.parallel_for( @@ -1123,9 +1226,10 @@ sycl::event reduction_over_group_temps_strided_impl( else { using SlmT = sycl::local_accessor; SlmT local_memory = SlmT(localRange, cgh); - 
using KernelName = class custom_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT, SlmT>; + using KernelName = + class custom_reduction_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; cgh.parallel_for( sycl::nd_range<1>(globalRange, localRange), CustomReductionOverGroupNoAtomicFunctor< @@ -1180,9 +1284,10 @@ sycl::event reduction_over_group_temps_strided_impl( auto localRange = sycl::range<1>{wg}; if constexpr (can_use_reduce_over_group::value) { - using KernelName = class reduction_over_group_temps_krn< - resTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; + using KernelName = + class reduction_over_group_temps_strided_krn< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; cgh.parallel_for( sycl::nd_range<1>(globalRange, localRange), ReductionOverGroupNoAtomicFunctor< @@ -1197,7 +1302,7 @@ sycl::event reduction_over_group_temps_strided_impl( using SlmT = sycl::local_accessor; SlmT local_memory = SlmT(localRange, cgh); using KernelName = - class custom_reduction_over_group_temps_krn< + class custom_reduction_over_group_temps_strided_krn< resTy, resTy, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT, SlmT>; cgh.parallel_for( @@ -1256,7 +1361,7 @@ sycl::event reduction_over_group_temps_strided_impl( if constexpr (can_use_reduce_over_group::value) { - using KernelName = class reduction_over_group_temps_krn< + using KernelName = class reduction_over_group_temps_strided_krn< argTy, resTy, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT>; cgh.parallel_for( @@ -1272,9 +1377,10 @@ sycl::event reduction_over_group_temps_strided_impl( else { using SlmT = sycl::local_accessor; SlmT local_memory = SlmT(localRange, cgh); - using KernelName = class custom_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT, SlmT>; + using KernelName = + class custom_reduction_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; cgh.parallel_for( sycl::nd_range<1>(globalRange, localRange), CustomReductionOverGroupNoAtomicFunctor< @@ -1304,1220 +1410,3219 @@ sycl::event reduction_over_group_temps_strided_impl( } } -/* @brief Types supported by comparison-reduction code based on atomic_ref */ -template -struct TypePairSupportDataForCompReductionAtomic +template +sycl::event reduction_axis1_over_group_temps_contig_impl( + sycl::queue &exec_q, + size_t iter_nelems, // number of reductions (num. of rows in a matrix + // when reducing over rows) + size_t reduction_nelems, // size of each reduction (length of rows, i.e. 
+ // number of columns) + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t reduction_arg_offset, + const std::vector &depends) { + const argTy *arg_tp = reinterpret_cast(arg_cp); + resTy *res_tp = reinterpret_cast(res_cp); - /* value if true a kernel for must be instantiated, false - * otherwise */ - static constexpr bool is_defined = std::disjunction< // disjunction is C++17 - // feature, supported - // by DPC++ - // input int32 - td_ns::TypePairDefinedEntry, - // input uint32 - td_ns::TypePairDefinedEntry, - // input int64 - td_ns::TypePairDefinedEntry, - // input uint64 - td_ns::TypePairDefinedEntry, - // input float - td_ns::TypePairDefinedEntry, - // input double - td_ns::TypePairDefinedEntry, - // fall-through - td_ns::NotDefinedEntry>::is_defined; -}; - -template -struct TypePairSupportDataForCompReductionTemps -{ + constexpr resTy identity_val = su_ns::Identity::value; - static constexpr bool is_defined = std::disjunction< // disjunction is C++17 - // feature, supported - // by DPC++ input bool - td_ns::TypePairDefinedEntry, - // input int8_t - td_ns::TypePairDefinedEntry, + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - // input uint8_t - td_ns::TypePairDefinedEntry, + constexpr size_t preferrered_reductions_per_wi = 8; + // max_max_wg prevents running out of resources on CPU + constexpr size_t max_max_wg = 2048; + size_t max_wg = std::min( + max_max_wg, d.get_info()); - // input int16_t - td_ns::TypePairDefinedEntry, + size_t reductions_per_wi(preferrered_reductions_per_wi); + if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { + // reduction only requries 1 work-group, can output directly to res + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); - // input uint16_t - td_ns::TypePairDefinedEntry, + using InputIterIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; - // input int32_t - td_ns::TypePairDefinedEntry, - // input uint32_t - td_ns::TypePairDefinedEntry, + InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{0, static_cast(iter_nelems), + static_cast(reduction_nelems)}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{}; - // input int64_t - td_ns::TypePairDefinedEntry, + wg = max_wg; + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); - // input uint32_t - td_ns::TypePairDefinedEntry, + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); - // input half - td_ns::TypePairDefinedEntry, + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; - // input float - td_ns::TypePairDefinedEntry, + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = + class single_reduction_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; - // input double - td_ns::TypePairDefinedEntry, + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(arg_tp, res_tp, 
ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class single_custom_reduction_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; - // input std::complex - td_ns::TypePairDefinedEntry, - outTy, - std::complex>, + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + }); + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + size_t reduction_groups = + (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / + (preferrered_reductions_per_wi * wg); + assert(reduction_groups > 1); - td_ns::TypePairDefinedEntry, - outTy, - std::complex>, + size_t second_iter_reduction_groups_ = + (reduction_groups + preferrered_reductions_per_wi * wg - 1) / + (preferrered_reductions_per_wi * wg); - // fall-through - td_ns::NotDefinedEntry>::is_defined; -}; + resTy *partially_reduced_tmp = sycl::malloc_device( + iter_nelems * (reduction_groups + second_iter_reduction_groups_), + exec_q); + resTy *partially_reduced_tmp2 = nullptr; -template -struct MaxOverAxisAtomicStridedFactory -{ - fnT get() const - { - if constexpr (TypePairSupportDataForCompReductionAtomic< - srcTy, dstTy>::is_defined) - { - if constexpr (std::is_floating_point::value) { - using ReductionOpT = su_ns::Maximum; - return dpctl::tensor::kernels:: - reduction_over_group_with_atomics_strided_impl< - srcTy, dstTy, ReductionOpT>; - } - else { - using ReductionOpT = sycl::maximum; - return dpctl::tensor::kernels:: - reduction_over_group_with_atomics_strided_impl< - srcTy, dstTy, ReductionOpT>; - } + if (partially_reduced_tmp == nullptr) { + throw std::runtime_error("Unabled to allocate device_memory"); } else { - return nullptr; + partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; } - } -}; -template -struct MaxOverAxisTempsStridedFactory -{ - fnT get() const - { - if constexpr (TypePairSupportDataForCompReductionTemps< - srcTy, dstTy>::is_defined) - { - if constexpr (std::is_integral_v && - !std::is_same_v) { - using ReductionOpT = sycl::maximum; - return dpctl::tensor::kernels:: - reduction_over_group_temps_strided_impl; - } - else { - using ReductionOpT = su_ns::Maximum; - return dpctl::tensor::kernels:: - reduction_over_group_temps_strided_impl; - } - } - else { - return nullptr; - } - } -}; + const sycl::event &first_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(depends); -template -struct MaxOverAxis1AtomicContigFactory -{ - fnT get() const - { - if constexpr (TypePairSupportDataForCompReductionAtomic< - srcTy, dstTy>::is_defined) - { - if constexpr (std::is_floating_point::value) { - using ReductionOpT = su_ns::Maximum; - return dpctl::tensor::kernels:: - reduction_axis1_over_group_with_atomics_contig_impl< - srcTy, dstTy, ReductionOpT>; - } - else { - using ReductionOpT = sycl::maximum; - return dpctl::tensor::kernels:: - reduction_axis1_over_group_with_atomics_contig_impl< - srcTy, dstTy, ReductionOpT>; - } - } - else { - return nullptr; - } - } -}; + using NoOpIndexerT = 
dpctl::tensor::offset_utils::NoOpIndexer; + using RowsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + RowsIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; -template -struct MaxOverAxis0AtomicContigFactory -{ - fnT get() const - { - if constexpr (TypePairSupportDataForCompReductionAtomic< - srcTy, dstTy>::is_defined) - { - if constexpr (std::is_floating_point::value) { - using ReductionOpT = su_ns::Maximum; - return dpctl::tensor::kernels:: - reduction_axis0_over_group_with_atomics_contig_impl< - srcTy, dstTy, ReductionOpT>; - } - else { - using ReductionOpT = sycl::maximum; - return dpctl::tensor::kernels:: - reduction_axis0_over_group_with_atomics_contig_impl< - srcTy, dstTy, ReductionOpT>; - } - } - else { - return nullptr; - } - } -}; + RowsIndexerT rows_indexer{ + 0, static_cast(iter_nelems), + static_cast(reduction_nelems)}; + NoOpIndexerT noop_tmp_indexer{}; + InputOutputIterIndexerT in_out_iter_indexer{rows_indexer, + noop_tmp_indexer}; + ReductionIndexerT reduction_indexer{}; -template -struct MinOverAxisAtomicStridedFactory -{ - fnT get() const - { - if constexpr (TypePairSupportDataForCompReductionAtomic< - srcTy, dstTy>::is_defined) - { - if constexpr (std::is_floating_point::value) { - using ReductionOpT = su_ns::Minimum; - return dpctl::tensor::kernels:: - reduction_over_group_with_atomics_strided_impl< - srcTy, dstTy, ReductionOpT>; - } - else { - using ReductionOpT = sycl::minimum; - return dpctl::tensor::kernels:: - reduction_over_group_with_atomics_strided_impl< - srcTy, dstTy, ReductionOpT>; - } - } - else { - return nullptr; - } - } -}; + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; -template -struct MinOverAxisTempsStridedFactory -{ - fnT get() const - { - if constexpr (TypePairSupportDataForCompReductionTemps< - srcTy, dstTy>::is_defined) - { - if constexpr (std::is_integral_v && - !std::is_same_v) { - using ReductionOpT = sycl::minimum; - return dpctl::tensor::kernels:: - reduction_over_group_temps_strided_impl; + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class first_reduction_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>( + arg_tp, partially_reduced_tmp, ReductionOpT(), + identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); } else { - using ReductionOpT = su_ns::Minimum; - return dpctl::tensor::kernels:: - reduction_over_group_temps_strided_impl; + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class first_custom_reduction_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, partially_reduced_tmp, ReductionOpT(), + identity_val, in_out_iter_indexer, reduction_indexer, + local_memory, reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); } - } - else { - return nullptr; - } - } -}; + }); -template -struct 
MinOverAxis1AtomicContigFactory -{ - fnT get() const - { - if constexpr (TypePairSupportDataForCompReductionAtomic< - srcTy, dstTy>::is_defined) - { - if constexpr (std::is_floating_point::value) { - using ReductionOpT = su_ns::Minimum; - return dpctl::tensor::kernels:: - reduction_axis1_over_group_with_atomics_contig_impl< - srcTy, dstTy, ReductionOpT>; - } - else { - using ReductionOpT = sycl::minimum; - return dpctl::tensor::kernels:: - reduction_axis1_over_group_with_atomics_contig_impl< - srcTy, dstTy, ReductionOpT>; - } - } - else { - return nullptr; - } - } -}; + size_t remaining_reduction_nelems = reduction_groups; -template -struct MinOverAxis0AtomicContigFactory -{ - fnT get() const - { - if constexpr (TypePairSupportDataForCompReductionAtomic< - srcTy, dstTy>::is_defined) - { - if constexpr (std::is_floating_point::value) { - using ReductionOpT = su_ns::Minimum; - return dpctl::tensor::kernels:: - reduction_axis0_over_group_with_atomics_contig_impl< - srcTy, dstTy, ReductionOpT>; - } - else { - using ReductionOpT = sycl::minimum; - return dpctl::tensor::kernels:: - reduction_axis0_over_group_with_atomics_contig_impl< - srcTy, dstTy, ReductionOpT>; - } - } - else { - return nullptr; - } - } -}; - -// Sum - -/* @brief Types supported by plus-reduction code based on atomic_ref */ -template -struct TypePairSupportDataForSumReductionAtomic -{ - - /* value if true a kernel for must be instantiated, false - * otherwise */ - static constexpr bool is_defined = std::disjunction< // disjunction is C++17 - // feature, supported - // by DPC++ input bool - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int8 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint8 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int16 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint16 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int32 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint32 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int64 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint64 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input half - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input float - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input double - td_ns::TypePairDefinedEntry, - // fall-through - td_ns::NotDefinedEntry>::is_defined; -}; - -template -struct TypePairSupportDataForSumReductionTemps -{ - - static constexpr bool is_defined = std::disjunction< // disjunction is C++17 - // feature, supported - // by DPC++ input bool - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - 
td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + sycl::event dependent_ev = first_reduction_ev; - // input int8_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, + while (remaining_reduction_nelems > + preferrered_reductions_per_wi * max_wg) { + size_t reduction_groups_ = + (remaining_reduction_nelems + + preferrered_reductions_per_wi * wg - 1) / + (preferrered_reductions_per_wi * wg); + assert(reduction_groups_ > 1); - // input uint8_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, + // keep reducing + sycl::event partial_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(dependent_ev); - // input int16_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, + using InputIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; - // input uint16_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; + ResIndexerT res_iter_indexer{}; - // input int32_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; - // input uint32_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups_ * wg}; + auto localRange = sycl::range<1>{wg}; + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class middle_reduction_axis1_temps_contig_krn< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>( + temp_arg, temp2_arg, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, + remaining_reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class middle_custom_reduction_axis1_temps_contig_krn< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + temp_arg, temp2_arg, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, + local_memory, remaining_reduction_nelems, + iter_nelems, preferrered_reductions_per_wi)); + } + }); - // input int64_t - td_ns::TypePairDefinedEntry, + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + dependent_ev = std::move(partial_reduction_ev); + } - // input uint32_t - 
td_ns::TypePairDefinedEntry, + // final reduction to res + sycl::event final_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_ev); - // input half - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns:: - TypePairDefinedEntry>, - td_ns::TypePairDefinedEntry>, + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - // input float - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry>, - td_ns::TypePairDefinedEntry>, + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(remaining_reduction_nelems)}; + ResIndexerT res_iter_indexer{}; - // input double - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry>, + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; - // input std::complex - td_ns::TypePairDefinedEntry, - outTy, - std::complex>, - td_ns::TypePairDefinedEntry, - outTy, - std::complex>, + wg = max_wg; + reductions_per_wi = + std::max(1, (remaining_reduction_nelems + wg - 1) / wg); - td_ns::TypePairDefinedEntry, - outTy, - std::complex>, + size_t reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); - // fall-throug - td_ns::NotDefinedEntry>::is_defined; -}; + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; -template -struct SumOverAxisAtomicStridedFactory -{ - fnT get() const - { - if constexpr (TypePairSupportDataForSumReductionAtomic< - srcTy, dstTy>::is_defined) - { - using ReductionOpT = sycl::plus; - return dpctl::tensor::kernels:: - reduction_over_group_with_atomics_strided_impl; - } - else { - return nullptr; - } - } -}; + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class final_reduction_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(temp_arg, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, + remaining_reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class final_custom_reduction_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + temp_arg, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + reductions_per_wi)); + } + }); -template -struct SumOverAxisTempsStridedFactory -{ - fnT get() const - { - if constexpr (TypePairSupportDataForSumReductionTemps< - srcTy, dstTy>::is_defined) { - using ReductionOpT = sycl::plus; - return dpctl::tensor::kernels:: - reduction_over_group_temps_strided_impl; - } - else { - 
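// The final pass above sizes itself so that exactly one work-group per output
// remains: with reductions_per_wi = ceil(remaining / wg) it follows that
// ceil(remaining / (reductions_per_wi * wg)) == 1, which the
// assert(reduction_groups == 1) relies on. A self-contained check of that
// arithmetic (the wg value is arbitrary):

#include <algorithm>
#include <cassert>
#include <cstddef>

static constexpr std::size_t ceil_div(std::size_t a, std::size_t b)
{
    return (a + b - 1) / b;
}

int main()
{
    constexpr std::size_t wg = 64; // stand-in for max_wg
    for (std::size_t remaining = 1; remaining <= (1u << 20); ++remaining) {
        const std::size_t per_wi =
            std::max<std::size_t>(1, ceil_div(remaining, wg));
        assert(ceil_div(remaining, per_wi * wg) == 1);
    }
    return 0;
}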
return nullptr; - } - } -}; + sycl::event cleanup_host_task_event = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(final_reduction_ev); + const sycl::context &ctx = exec_q.get_context(); -template -struct SumOverAxis1AtomicContigFactory -{ - fnT get() const - { - if constexpr (TypePairSupportDataForSumReductionAtomic< - srcTy, dstTy>::is_defined) - { - using ReductionOpT = sycl::plus; - return dpctl::tensor::kernels:: - reduction_axis1_over_group_with_atomics_contig_impl< - srcTy, dstTy, ReductionOpT>; - } - else { - return nullptr; - } + cgh.host_task([ctx, partially_reduced_tmp] { + sycl::free(partially_reduced_tmp, ctx); + }); + }); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; } -}; +} -template -struct SumOverAxis0AtomicContigFactory +template +sycl::event reduction_axis0_over_group_temps_contig_impl( + sycl::queue &exec_q, + size_t iter_nelems, // number of reductions (num. of rows in a matrix + // when reducing over rows) + size_t reduction_nelems, // size of each reduction (length of rows, i.e. + // number of columns) + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t reduction_arg_offset, + const std::vector &depends) { - fnT get() const - { - if constexpr (TypePairSupportDataForSumReductionAtomic< - srcTy, dstTy>::is_defined) - { - using ReductionOpT = sycl::plus; - return dpctl::tensor::kernels:: - reduction_axis0_over_group_with_atomics_contig_impl< - srcTy, dstTy, ReductionOpT>; - } - else { - return nullptr; - } - } -}; + const argTy *arg_tp = reinterpret_cast(arg_cp); + resTy *res_tp = reinterpret_cast(res_cp); -// Product + constexpr resTy identity_val = su_ns::Identity::value; -/* @brief Types supported by plus-reduction code based on atomic_ref */ -template -struct TypePairSupportDataForProductReductionAtomic -{ + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - /* value if true a kernel for must be instantiated, false - * otherwise */ - static constexpr bool is_defined = std::disjunction< // disjunction is C++17 - // feature, supported - // by DPC++ input bool - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int8 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint8 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int16 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint16 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int32 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint32 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int64 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input 
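// The cleanup_host_task_event pattern above recurs in every temps-based
// implementation: a host task, ordered after the final kernel, frees the USM
// scratch allocation. A hedged sketch of the same idiom as a stand-alone
// helper (the helper name is illustrative, not dpctl API):

#include <vector>
#include <sycl/sycl.hpp>

sycl::event async_usm_free(sycl::queue &q,
                           void *usm_ptr,
                           const std::vector<sycl::event> &deps)
{
    return q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(deps);
        const sycl::context ctx = q.get_context();
        // executes on a host thread once `deps` have completed
        cgh.host_task([ctx, usm_ptr] { sycl::free(usm_ptr, ctx); });
    });
}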
uint64 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input half - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input float - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input double - td_ns::TypePairDefinedEntry, - // fall-through - td_ns::NotDefinedEntry>::is_defined; -}; + constexpr size_t preferrered_reductions_per_wi = 8; + // max_max_wg prevents running out of resources on CPU + constexpr size_t max_max_wg = 2048; + size_t max_wg = std::min( + max_max_wg, d.get_info()); -template -struct TypePairSupportDataForProductReductionTemps -{ + size_t reductions_per_wi(preferrered_reductions_per_wi); + if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { + // reduction only requries 1 work-group, can output directly to res + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); - static constexpr bool is_defined = std::disjunction< // disjunction is C++17 - // feature, supported - // by DPC++ input bool - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; - // input int8_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, + NoOpIndexerT columns_indexer{}; + NoOpIndexerT result_indexer{}; + InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + ReductionIndexerT reduction_indexer{ + 0, /* size */ static_cast(reduction_nelems), + /* step */ static_cast(iter_nelems)}; - // input uint8_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, + wg = max_wg; + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); - // input int16_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); - // input uint16_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; - // input int32_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = + class single_reduction_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; - // input uint32_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(arg_tp, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { 
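// Both branches around this point compute the same reduction;
// can_use_reduce_over_group (defined earlier in this header) selects
// sycl::reduce_over_group when the operator/type pair is natively supported
// and otherwise falls back to the Custom...Functor reducing through local
// memory. As a rough approximation only, the native case is close to what
// sycl::has_known_identity captures:

#include <type_traits>
#include <sycl/sycl.hpp>

template <typename Op, typename T>
struct can_use_reduce_over_group_sketch
    : std::bool_constant<sycl::has_known_identity<Op, T>::value>
{
};

static_assert(can_use_reduce_over_group_sketch<sycl::plus<int>, int>::value);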
+ using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class single_custom_reduction_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; - // input int64_t - td_ns::TypePairDefinedEntry, + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + }); + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + size_t reduction_groups = + (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / + (preferrered_reductions_per_wi * wg); + assert(reduction_groups > 1); - // input uint32_t - td_ns::TypePairDefinedEntry, + size_t second_iter_reduction_groups_ = + (reduction_groups + preferrered_reductions_per_wi * wg - 1) / + (preferrered_reductions_per_wi * wg); - // input half - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns:: - TypePairDefinedEntry>, - td_ns::TypePairDefinedEntry>, + resTy *partially_reduced_tmp = sycl::malloc_device( + iter_nelems * (reduction_groups + second_iter_reduction_groups_), + exec_q); + resTy *partially_reduced_tmp2 = nullptr; - // input float - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry>, - td_ns::TypePairDefinedEntry>, + if (partially_reduced_tmp == nullptr) { + throw std::runtime_error("Unabled to allocate device_memory"); + } + else { + partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + } - // input double - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry>, + const sycl::event &first_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(depends); - // input std::complex - td_ns::TypePairDefinedEntry, - outTy, - std::complex>, - td_ns::TypePairDefinedEntry, - outTy, - std::complex>, + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; - td_ns::TypePairDefinedEntry, - outTy, - std::complex>, + NoOpIndexerT columns_indexer{}; + NoOpIndexerT noop_tmp_indexer{}; + InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + noop_tmp_indexer}; + ReductionIndexerT reduction_indexer{ + 0, /* size */ static_cast(reduction_nelems), + /* step */ static_cast(iter_nelems)}; - // fall-throug - td_ns::NotDefinedEntry>::is_defined; -}; + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; -template -struct ProductOverAxisAtomicStridedFactory -{ - fnT get() const - { - if constexpr (TypePairSupportDataForProductReductionAtomic< - srcTy, dstTy>::is_defined) - { - using ReductionOpT = sycl::multiplies; - return dpctl::tensor::kernels:: - reduction_over_group_with_atomics_strided_impl; - } - else { - return nullptr; - } - } -}; + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class first_reduction_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, 
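// A single malloc_device above backs both ping-pong buffers: the first pass
// emits reduction_groups partials per output, and every later pass emits at
// most second_iter_reduction_groups_, so partially_reduced_tmp2 can start
// immediately after the first buffer. Sketch of that sizing (struct and
// function names assumed for illustration):

#include <cstddef>

struct TmpSizing
{
    std::size_t total_elems; // elements for the single malloc_device call
    std::size_t tmp2_offset; // partially_reduced_tmp2 = tmp + tmp2_offset
};

static constexpr TmpSizing size_reduction_tmps(std::size_t iter_nelems,
                                               std::size_t reduction_groups,
                                               std::size_t per_wi,
                                               std::size_t wg)
{
    const std::size_t second_groups =
        (reduction_groups + per_wi * wg - 1) / (per_wi * wg);
    return {iter_nelems * (reduction_groups + second_groups),
            reduction_groups * iter_nelems};
}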
localRange), + ReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>( + arg_tp, partially_reduced_tmp, ReductionOpT(), + identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class first_custom_reduction_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, partially_reduced_tmp, ReductionOpT(), + identity_val, in_out_iter_indexer, reduction_indexer, + local_memory, reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); + } + }); -template -struct ProductOverAxisTempsStridedFactory -{ - fnT get() const - { - if constexpr (TypePairSupportDataForProductReductionTemps< - srcTy, dstTy>::is_defined) - { - using ReductionOpT = sycl::multiplies; - return dpctl::tensor::kernels:: - reduction_over_group_temps_strided_impl; - } - else { - return nullptr; - } - } -}; + size_t remaining_reduction_nelems = reduction_groups; -template -struct ProductOverAxis1AtomicContigFactory -{ - fnT get() const - { - if constexpr (TypePairSupportDataForProductReductionAtomic< - srcTy, dstTy>::is_defined) - { - using ReductionOpT = sycl::multiplies; - return dpctl::tensor::kernels:: - reduction_axis1_over_group_with_atomics_contig_impl< - srcTy, dstTy, ReductionOpT>; - } - else { - return nullptr; - } - } -}; + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + sycl::event dependent_ev = first_reduction_ev; -template -struct ProductOverAxis0AtomicContigFactory -{ - fnT get() const - { - if constexpr (TypePairSupportDataForProductReductionAtomic< - srcTy, dstTy>::is_defined) - { - using ReductionOpT = sycl::multiplies; - return dpctl::tensor::kernels:: - reduction_axis0_over_group_with_atomics_contig_impl< - srcTy, dstTy, ReductionOpT>; - } - else { - return nullptr; - } - } -}; + while (remaining_reduction_nelems > + preferrered_reductions_per_wi * max_wg) { + size_t reduction_groups_ = + (remaining_reduction_nelems + + preferrered_reductions_per_wi * wg - 1) / + (preferrered_reductions_per_wi * wg); + assert(reduction_groups_ > 1); -// Argmax and Argmin + // keep reducing + sycl::event partial_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(dependent_ev); -/* = Search reduction using reduce_over_group*/ + using InputIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; -template -struct SearchReduction -{ -private: - const argT *inp_ = nullptr; - argT *vals_ = nullptr; - const outT *inds_ = nullptr; - outT *out_ = nullptr; - ReductionOp reduction_op_; - argT identity_; - IdxReductionOp idx_reduction_op_; - outT idx_identity_; - InputOutputIterIndexerT inp_out_iter_indexer_; - InputRedIndexerT inp_reduced_dims_indexer_; - size_t reduction_max_gid_ = 0; - size_t iter_gws_ = 1; - size_t reductions_per_wi = 16; + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + 
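// For the axis-0 contiguous case each reduction walks one column of a
// C-contiguous matrix: element m of reduction j sits at j + m * iter_nelems,
// which is what the Strided1DIndexer arguments (size reduction_nelems, step
// iter_nelems) encode. A host-side check of that addressing:

#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    const std::size_t rows = 4, cols = 3; // reduce over rows (axis 0)
    std::vector<long> a(rows * cols);
    for (std::size_t i = 0; i < a.size(); ++i)
        a[i] = static_cast<long>(i);

    for (std::size_t j = 0; j < cols; ++j) {
        long sum = 0;
        for (std::size_t m = 0; m < rows; ++m)
            sum += a[j + m * cols]; // step = iter_nelems (= cols)
        assert(sum ==
               static_cast<long>(j * rows + cols * rows * (rows - 1) / 2));
    }
    return 0;
}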
static_cast(reduction_groups_)}; + ResIndexerT res_iter_indexer{}; -public: - SearchReduction(const argT *data, - argT *vals, - const outT *inds, - outT *res, - ReductionOp reduction_op, - const argT &identity_val, - IdxReductionOp idx_reduction_op, - const outT &idx_identity_val, - InputOutputIterIndexerT arg_res_iter_indexer, - InputRedIndexerT arg_reduced_dims_indexer, - size_t reduction_size, - size_t iteration_size, - size_t reduction_size_per_wi) - : inp_(data), vals_(vals), inds_(inds), out_(res), - reduction_op_(reduction_op), identity_(identity_val), - idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val), - inp_out_iter_indexer_(arg_res_iter_indexer), - inp_reduced_dims_indexer_(arg_reduced_dims_indexer), - reduction_max_gid_(reduction_size), iter_gws_(iteration_size), - reductions_per_wi(reduction_size_per_wi) - { - } + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; - void operator()(sycl::nd_item<1> it) const - { - const size_t reduction_lid = it.get_local_id(0); - const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups_ * wg}; + auto localRange = sycl::range<1>{wg}; + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class middle_reduction_axis0_temps_contig_krn< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>( + temp_arg, temp2_arg, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, + remaining_reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class middle_custom_reduction_axis0_temps_contig_krn< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + temp_arg, temp2_arg, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, + local_memory, remaining_reduction_nelems, + iter_nelems, preferrered_reductions_per_wi)); + } + }); - const size_t iter_gid = it.get_group(0) % iter_gws_; - const size_t reduction_batch_id = it.get_group(0) / iter_gws_; - const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + dependent_ev = std::move(partial_reduction_ev); + } - // work-items operates over input with indices - // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg - // + reduction_lid - // for 0 <= m < reductions_per_wi + // final reduction to res + sycl::event final_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_ev); - auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); - const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); - const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + 
InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - argT local_red_val(identity_); - outT local_idx(idx_identity_); - size_t arg_reduce_gid0 = - reduction_lid + reduction_batch_id * wg * reductions_per_wi; - for (size_t m = 0; m < reductions_per_wi; ++m) { - size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(remaining_reduction_nelems)}; + ResIndexerT res_iter_indexer{}; - if (arg_reduce_gid < reduction_max_gid_) { - auto inp_reduction_offset = - inp_reduced_dims_indexer_(arg_reduce_gid); - auto inp_offset = inp_iter_offset + inp_reduction_offset; + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; - argT val = inp_[inp_offset]; - if (val == local_red_val) { - if constexpr (!First) { - local_idx = - idx_reduction_op_(local_idx, inds_[inp_offset]); - } - else { - local_idx = idx_reduction_op_( - local_idx, static_cast(arg_reduce_gid)); - } - } - else { - if constexpr (su_ns::IsMinimum::value) { - if (val < local_red_val) { - local_red_val = val; - if constexpr (!First) { - local_idx = inds_[inp_offset]; - } - else { - local_idx = static_cast(arg_reduce_gid); - } - } - } - else if constexpr (su_ns::IsMaximum::value) { - if (val > local_red_val) { - local_red_val = val; - if constexpr (!First) { - local_idx = inds_[inp_offset]; - } - else { - local_idx = static_cast(arg_reduce_gid); - } - } - } - } - } - } + wg = max_wg; + reductions_per_wi = + std::max(1, (remaining_reduction_nelems + wg - 1) / wg); - auto work_group = it.get_group(); - // This only works if reduction_op_ is from small set of operators - argT red_val_over_wg = sycl::reduce_over_group( - work_group, local_red_val, identity_, reduction_op_); + size_t reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); - if constexpr (std::is_integral_v) { - local_idx = - (red_val_over_wg == local_red_val) ? local_idx : idx_identity_; - } - else { - local_idx = - (red_val_over_wg == local_red_val || - std::isnan(red_val_over_wg) || std::isnan(local_red_val)) - ? 
local_idx - : idx_identity_; - } - outT idx_over_wg = sycl::reduce_over_group( - work_group, local_idx, idx_identity_, idx_reduction_op_); + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; - if (work_group.leader()) { - // each group writes to a different memory location - if constexpr (!Last) { - // if not the final reduction, write value corresponding to - // an index to a temporary - vals_[out_iter_offset * n_reduction_groups + - reduction_batch_id] = red_val_over_wg; + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class final_reduction_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(temp_arg, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, + remaining_reduction_nelems, + iter_nelems, reductions_per_wi)); } - out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = - idx_over_wg; - } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class final_custom_reduction_axis0_temps_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + temp_arg, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + reductions_per_wi)); + } + }); + + sycl::event cleanup_host_task_event = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(final_reduction_ev); + const sycl::context &ctx = exec_q.get_context(); + + cgh.host_task([ctx, partially_reduced_tmp] { + sycl::free(partially_reduced_tmp, ctx); + }); + }); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; } -}; +} -/* = Search reduction using custom_reduce_over_group*/ +/* @brief Types supported by comparison-reduction code based on atomic_ref */ +template +struct TypePairSupportDataForCompReductionAtomic +{ -template -struct CustomSearchReduction + /* value if true a kernel for must be instantiated, false + * otherwise */ + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ + // input int32 + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForCompReductionTemps { -private: - const argT *inp_ = nullptr; - argT *vals_ = nullptr; - const outT *inds_ = nullptr; - outT *out_ = nullptr; - ReductionOp reduction_op_; - argT identity_; - IdxReductionOp idx_reduction_op_; - outT idx_identity_; - InputOutputIterIndexerT inp_out_iter_indexer_; - InputRedIndexerT inp_reduced_dims_indexer_; - SlmT local_mem_; - size_t reduction_max_gid_ = 0; - size_t iter_gws_ = 1; - size_t reductions_per_wi = 16; -public: - CustomSearchReduction(const argT *data, - argT *vals, - outT *inds, - outT *res, - ReductionOp reduction_op, - const argT &identity_val, - IdxReductionOp 
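// The TypePairSupportData... structs above gate each (input, output) type
// combination at compile time: std::disjunction walks a list of
// TypePairDefinedEntry instantiations and falls through to NotDefinedEntry.
// A self-contained sketch of the mechanism (the two entries listed are
// illustrative, not this patch's actual support matrix):

#include <cstdint>
#include <type_traits>

template <typename T1, typename ArgT, typename T2, typename OutT>
struct TypePairDefinedEntrySketch
{
    static constexpr bool value =
        std::is_same_v<T1, ArgT> && std::is_same_v<T2, OutT>;
    static constexpr bool is_defined = value; // what the factories consult
};

struct NotDefinedEntrySketch
{
    static constexpr bool value = false;
    static constexpr bool is_defined = false;
};

template <typename argTy, typename outTy>
struct ExampleSupport
{
    static constexpr bool is_defined = std::disjunction<
        TypePairDefinedEntrySketch<argTy, std::int32_t, outTy, std::int64_t>,
        TypePairDefinedEntrySketch<argTy, float, outTy, float>,
        NotDefinedEntrySketch>::is_defined;
};

static_assert(ExampleSupport<float, float>::is_defined);
static_assert(!ExampleSupport<bool, bool>::is_defined);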
idx_reduction_op, - const outT &idx_identity_val, - InputOutputIterIndexerT arg_res_iter_indexer, - InputRedIndexerT arg_reduced_dims_indexer, - SlmT local_mem, - size_t reduction_size, - size_t iteration_size, - size_t reduction_size_per_wi) - : inp_(data), vals_(vals), inds_(inds), out_(res), - reduction_op_(reduction_op), identity_(identity_val), - idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val), - inp_out_iter_indexer_(arg_res_iter_indexer), - inp_reduced_dims_indexer_(arg_reduced_dims_indexer), - local_mem_(local_mem), reduction_max_gid_(reduction_size), - iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + // input int8_t + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct MaxOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxisTempsStridedFactory +{ + fnT get() const { + if constexpr (TypePairSupportDataForCompReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + } + else { + return nullptr; + } } +}; - void operator()(sycl::nd_item<1> it) const +template +struct MaxOverAxis1AtomicContigFactory +{ + fnT get() const { - const size_t reduction_lid = it.get_local_id(0); - const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; - const size_t iter_gid = it.get_group(0) % iter_gws_; - const size_t reduction_batch_id = it.get_group(0) / 
iter_gws_; - const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; +template +struct MaxOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr 
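// The Max/Min factories above route floating-point types to su_ns::Maximum /
// su_ns::Minimum and integral types to sycl::maximum / sycl::minimum; the
// custom functors exist so that NaNs propagate through the reduction, which a
// plain operator<-based max/min would silently drop. A hedged sketch of a
// NaN-propagating maximum in the same spirit:

#include <cmath>
#include <type_traits>

template <typename T> struct NanPropagatingMaximum
{
    T operator()(const T &a, const T &b) const
    {
        if constexpr (std::is_floating_point_v<T>) {
            if (std::isnan(a))
                return a; // a NaN input wins, so it survives the reduction
            if (std::isnan(b))
                return b;
        }
        return (a < b) ? b : a;
    }
};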
(TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + } + else { + return nullptr; + } + } +}; + +// Sum + +/* @brief Types supported by plus-reduction code based on atomic_ref */ +template +struct TypePairSupportDataForSumReductionAtomic +{ + + /* value if true a kernel for must be instantiated, false + * otherwise */ + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForSumReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + 
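// Note that the ...Atomic support tables stop at 32- and 64-bit accumulators:
// device-side sycl::atomic_ref is only defined for 4- and 8-byte arithmetic
// types, so narrower destination types must take the temps-based path.
// An informal restatement of that width constraint:

#include <cstddef>

template <typename T>
inline constexpr bool fits_device_atomic_v =
    (sizeof(T) == 4 || sizeof(T) == 8);

static_assert(fits_device_atomic_v<float> && fits_device_atomic_v<long long>);
static_assert(!fits_device_atomic_v<short>);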
td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns:: + TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input double + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-throug + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct SumOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +// Product + +/* @brief Types supported by plus-reduction code based on atomic_ref */ +template +struct TypePairSupportDataForProductReductionAtomic +{ + + /* value if true a kernel for must be 
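// Each ...Factory above returns either a typed kernel pointer or nullptr;
// dpctl instantiates the factories over all (srcTy, dstTy) pairs to fill a
// two-dimensional dispatch table, and a nullptr slot simply means the pair is
// not supported. A reduced sketch of that mechanism (the predicate and the
// payload function are stand-ins):

#include <cstddef>

using fn_ptr_t = int (*)();

template <typename srcTy, typename dstTy> struct ExampleFactory
{
    fn_ptr_t get() const
    {
        if constexpr (sizeof(dstTy) >= sizeof(srcTy)) { // stand-in predicate
            return +[]() -> int {
                return static_cast<int>(sizeof(srcTy) * 100 + sizeof(dstTy));
            };
        }
        else {
            return nullptr; // unsupported combination: slot stays empty
        }
    }
};

int main()
{
    fn_ptr_t table[2][2] = {
        {ExampleFactory<char, char>{}.get(), ExampleFactory<char, int>{}.get()},
        {ExampleFactory<int, char>{}.get(), ExampleFactory<int, int>{}.get()},
    };
    return table[1][0] == nullptr ? 0 : 1; // int -> char was not instantiated
}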
instantiated, false + * otherwise */ + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForProductReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns:: + TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input double + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-throug + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct ProductOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxisTempsStridedFactory +{ + fnT get() const 
+ { + if constexpr (TypePairSupportDataForProductReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct TypePairSupportDataForHypotReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct HypotOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForHypotReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::Hypot; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct HypotOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForHypotReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::Hypot; 
+ return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct HypotOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForHypotReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::Hypot; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct TypePairSupportDataForLogSumExpReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint64_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct LogSumExpOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForLogSumExpReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::LogSumExp; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct LogSumExpOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForLogSumExpReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::LogSumExp; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct LogSumExpOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForLogSumExpReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = su_ns::LogSumExp; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_temps_contig_impl; + } + else { + return nullptr; + } + } +}; + +// Argmax and Argmin + +/* = Search reduction using reduce_over_group*/ + +template +struct SearchReduction +{ +private: + const argT *inp_ = nullptr; + argT *vals_ = nullptr; + const outT *inds_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + argT identity_; + IdxReductionOp idx_reduction_op_; + outT idx_identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + size_t reduction_max_gid_ = 0; + size_t iter_gws_ = 1; + size_t reductions_per_wi = 16; + +public: + SearchReduction(const argT *data, + argT *vals, + const outT *inds, + outT *res, + ReductionOp reduction_op, + const argT &identity_val, + IdxReductionOp idx_reduction_op, + const outT &idx_identity_val, + 
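// Unlike sum or product, su_ns::Hypot and su_ns::LogSumExp above come only in
// temps-based factories: their combine step is a transcendental expression
// rather than an operation hardware atomics support. Assuming the usual
// pairwise log-sum-exp combine, a numerically shifted host-side sketch
// (identity element is -infinity):

#include <algorithm>
#include <cmath>

template <typename T> struct LogSumExpSketch
{
    T operator()(T a, T b) const
    {
        if (std::isinf(a) && a < T(0))
            return b; // -inf is the identity element
        if (std::isinf(b) && b < T(0))
            return a;
        const T mx = std::max(a, b);
        // log(exp(a) + exp(b)) with the max factored out to avoid overflow
        return mx + std::log1p(std::exp(std::min(a, b) - mx));
    }
};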
InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + size_t reduction_size, + size_t iteration_size, + size_t reduction_size_per_wi) + : inp_(data), vals_(vals), inds_(inds), out_(res), + reduction_op_(reduction_op), identity_(identity_val), + idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val), + inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size), iter_gws_(iteration_size), + reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const size_t reduction_lid = it.get_local_id(0); + const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; + + // work-items operates over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + argT local_red_val(identity_); + outT local_idx(idx_identity_); + size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (size_t m = 0; m < reductions_per_wi; ++m) { + size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + argT val = inp_[inp_offset]; + if (val == local_red_val) { + if constexpr (!First) { + local_idx = + idx_reduction_op_(local_idx, inds_[inp_offset]); + } + else { + local_idx = idx_reduction_op_( + local_idx, static_cast(arg_reduce_gid)); + } + } + else { + if constexpr (su_ns::IsMinimum::value) { + if (val < local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + else if constexpr (su_ns::IsMaximum::value) { + if (val > local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + } + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + argT red_val_over_wg = sycl::reduce_over_group( + work_group, local_red_val, identity_, reduction_op_); + + if constexpr (std::is_integral_v) { + local_idx = + (red_val_over_wg == local_red_val) ? local_idx : idx_identity_; + } + else { + local_idx = + (red_val_over_wg == local_red_val || + std::isnan(red_val_over_wg) || std::isnan(local_red_val)) + ? 
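// The work-group combine just above happens in two phases: first the values
// are reduced with reduce_over_group, then every work-item whose local value
// matched the group result votes its index, and the indices are reduced with
// idx_reduction_op_ (minimum, so ties resolve to the smallest index). Serial
// emulation of those two phases for one group (argmin over floats, assuming a
// non-empty group):

#include <cstddef>
#include <limits>
#include <vector>

std::size_t group_argmin(const std::vector<float> &vals,
                         const std::vector<std::size_t> &idxs)
{
    float red = vals.at(0);
    for (float v : vals)
        red = (v < red) ? v : red; // phase 1: reduce the values

    const std::size_t idx_identity = std::numeric_limits<std::size_t>::max();
    std::size_t idx = idx_identity;
    for (std::size_t i = 0; i < vals.size(); ++i) {
        // phase 2: only items holding the winning value contribute an index
        const std::size_t vote = (vals[i] == red) ? idxs[i] : idx_identity;
        idx = (vote < idx) ? vote : idx;
    }
    return idx;
}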
local_idx + : idx_identity_; + } + outT idx_over_wg = sycl::reduce_over_group( + work_group, local_idx, idx_identity_, idx_reduction_op_); + + if (work_group.leader()) { + // each group writes to a different memory location + if constexpr (!Last) { + // if not the final reduction, write value corresponding to + // an index to a temporary + vals_[out_iter_offset * n_reduction_groups + + reduction_batch_id] = red_val_over_wg; + } + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + idx_over_wg; + } + } +}; + +/* = Search reduction using custom_reduce_over_group*/ + +template +struct CustomSearchReduction +{ +private: + const argT *inp_ = nullptr; + argT *vals_ = nullptr; + const outT *inds_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + argT identity_; + IdxReductionOp idx_reduction_op_; + outT idx_identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + size_t reduction_max_gid_ = 0; + size_t iter_gws_ = 1; + size_t reductions_per_wi = 16; + +public: + CustomSearchReduction(const argT *data, + argT *vals, + outT *inds, + outT *res, + ReductionOp reduction_op, + const argT &identity_val, + IdxReductionOp idx_reduction_op, + const outT &idx_identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + SlmT local_mem, + size_t reduction_size, + size_t iteration_size, + size_t reduction_size_per_wi) + : inp_(data), vals_(vals), inds_(inds), out_(res), + reduction_op_(reduction_op), identity_(identity_val), + idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val), + inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const size_t reduction_lid = it.get_local_id(0); + const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; + + // work-items operates over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + argT local_red_val(identity_); + outT local_idx(idx_identity_); + size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (size_t m = 0; m < reductions_per_wi; ++m) { + size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + argT val = inp_[inp_offset]; + if (val == local_red_val) { + if constexpr (!First) { + local_idx = + idx_reduction_op_(local_idx, inds_[inp_offset]); + } + else { + local_idx = idx_reduction_op_( + local_idx, static_cast(arg_reduce_gid)); + } + } + else { + if constexpr (su_ns::IsMinimum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::less_complex; + // less_complex always 
+                            // check
+                            if (less_complex(val, local_red_val) ||
+                                std::isnan(std::real(val)) ||
+                                std::isnan(std::imag(val)))
+                            {
+                                local_red_val = val;
+                                if constexpr (!First) {
+                                    local_idx = inds_[inp_offset];
+                                }
+                                else {
+                                    local_idx =
+                                        static_cast<outT>(arg_reduce_gid);
+                                }
+                            }
+                        }
+                        else if constexpr (std::is_floating_point_v<argT>) {
+                            if (val < local_red_val || std::isnan(val)) {
+                                local_red_val = val;
+                                if constexpr (!First) {
+                                    local_idx = inds_[inp_offset];
+                                }
+                                else {
+                                    local_idx =
+                                        static_cast<outT>(arg_reduce_gid);
+                                }
+                            }
+                        }
+                        else {
+                            if (val < local_red_val) {
+                                local_red_val = val;
+                                if constexpr (!First) {
+                                    local_idx = inds_[inp_offset];
+                                }
+                                else {
+                                    local_idx =
+                                        static_cast<outT>(arg_reduce_gid);
+                                }
+                            }
+                        }
+                    }
+                    else if constexpr (su_ns::IsMaximum<ReductionOp>::value) {
+                        using dpctl::tensor::type_utils::is_complex;
+                        if constexpr (is_complex<argT>::value) {
+                            using dpctl::tensor::math_utils::greater_complex;
+                            if (greater_complex(val, local_red_val) ||
+                                std::isnan(std::real(val)) ||
+                                std::isnan(std::imag(val)))
+                            {
+                                local_red_val = val;
+                                if constexpr (!First) {
+                                    local_idx = inds_[inp_offset];
+                                }
+                                else {
+                                    local_idx =
+                                        static_cast<outT>(arg_reduce_gid);
+                                }
+                            }
+                        }
+                        else if constexpr (std::is_floating_point_v<argT>) {
+                            if (val > local_red_val || std::isnan(val)) {
+                                local_red_val = val;
+                                if constexpr (!First) {
+                                    local_idx = inds_[inp_offset];
+                                }
+                                else {
+                                    local_idx =
+                                        static_cast<outT>(arg_reduce_gid);
+                                }
+                            }
+                        }
+                        else {
+                            if (val > local_red_val) {
+                                local_red_val = val;
+                                if constexpr (!First) {
+                                    local_idx = inds_[inp_offset];
+                                }
+                                else {
+                                    local_idx =
+                                        static_cast<outT>(arg_reduce_gid);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        auto work_group = it.get_group();
+        // This only works if reduction_op_ is from a small set of operators
+        argT red_val_over_wg = su_ns::custom_reduce_over_group(
+            work_group, local_mem_, local_red_val, reduction_op_);
+
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (is_complex<argT>::value) {
+            // equality does not hold for NaNs, so check here
+            local_idx = (red_val_over_wg == local_red_val ||
+                         std::isnan(std::real(local_red_val)) ||
+                         std::isnan(std::imag(local_red_val)))
+                            ? local_idx
+                            : idx_identity_;
+        }
+        else if constexpr (std::is_floating_point_v<argT>) {
+            // equality does not hold for NaNs, so check here
+            local_idx =
+                (red_val_over_wg == local_red_val || std::isnan(local_red_val))
+                    ? local_idx
+                    : idx_identity_;
+        }
+        else {
+            local_idx =
+                red_val_over_wg == local_red_val ? 
local_idx : idx_identity_; + } + outT idx_over_wg = sycl::reduce_over_group( + work_group, local_idx, idx_identity_, idx_reduction_op_); + if (work_group.leader()) { + // each group writes to a different memory location + if constexpr (!Last) { + // if not the final reduction, write value corresponding to + // an index to a temporary + vals_[out_iter_offset * n_reduction_groups + + reduction_batch_id] = red_val_over_wg; + } + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + idx_over_wg; + } + } +}; + +typedef sycl::event (*search_strided_impl_fn_ptr)( + sycl::queue, + size_t, + size_t, + const char *, + char *, + int, + const py::ssize_t *, + py::ssize_t, + py::ssize_t, + int, + const py::ssize_t *, + py::ssize_t, + const std::vector &); + +template +class search_over_group_temps_strided_krn; + +template +class custom_search_over_group_temps_strided_krn; + +template +class single_search_axis0_temps_contig_krn; + +template +class first_search_axis0_temps_contig_krn; + +template +class middle_search_axis0_temps_contig_krn; + +template +class final_search_axis0_temps_contig_krn; + +template +class single_custom_search_axis0_temps_contig_krn; + +template +class first_custom_search_axis0_temps_contig_krn; + +template +class middle_custom_search_axis0_temps_contig_krn; + +template +class final_custom_search_axis0_temps_contig_krn; + +template +class single_search_axis1_temps_contig_krn; + +template +class first_search_axis1_temps_contig_krn; + +template +class middle_search_axis1_temps_contig_krn; + +template +class final_search_axis1_temps_contig_krn; + +template +class single_custom_search_axis1_temps_contig_krn; + +template +class first_custom_search_axis1_temps_contig_krn; + +template +class middle_custom_search_axis1_temps_contig_krn; + +template +class final_custom_search_axis1_temps_contig_krn; + +template +sycl::event search_over_group_temps_strided_impl( + sycl::queue exec_q, + size_t iter_nelems, // number of reductions (num. of rows in a matrix + // when reducing over rows) + size_t reduction_nelems, // size of each reduction (length of rows, i.e. 
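// A minimal sketch of the index arithmetic the two kernels above share;
// gid() is a hypothetical helper restating the inp_data_id comment in
// operator():
//
//     // work-item `lid` of reduction-batch `batch` visits, for 0 <= m < rpw:
//     size_t gid(size_t batch, size_t wg, size_t rpw, size_t m, size_t lid)
//     {
//         return batch * wg * rpw + m * wg + lid;
//     }
//
// For a C-contiguous row (the axis-1 case further below) consecutive
// work-items read adjacent elements in every sweep m.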
+                             // number of columns)
+    const char *arg_cp,
+    char *res_cp,
+    int iter_nd,
+    const py::ssize_t *iter_shape_and_strides,
+    py::ssize_t iter_arg_offset,
+    py::ssize_t iter_res_offset,
+    int red_nd,
+    const py::ssize_t *reduction_shape_stride,
+    py::ssize_t reduction_arg_offset,
+    const std::vector<sycl::event> &depends)
+{
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp);
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
+
+    constexpr argTy identity_val = su_ns::Identity<ReductionOpT, argTy>::value;
+    constexpr resTy idx_identity_val = su_ns::Identity<IndexOpT, resTy>::value;
+
+    const sycl::device &d = exec_q.get_device();
+    const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
+    size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
+
+    constexpr size_t preferrered_reductions_per_wi = 4;
+    // capping max_wg prevents running out of resources on CPU
+    size_t max_wg = std::min(
+        size_t(2048), d.get_info<sycl::info::device::max_work_group_size>());
+
+    size_t reductions_per_wi(preferrered_reductions_per_wi);
+    if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) {
+        // reduction only requires 1 work-group, can output directly to res
+        sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
+            cgh.depends_on(depends);
+
+            using InputOutputIterIndexerT =
+                dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;
+            using ReductionIndexerT =
+                dpctl::tensor::offset_utils::StridedIndexer;
+
+            InputOutputIterIndexerT in_out_iter_indexer{
+                iter_nd, iter_arg_offset, iter_res_offset,
+                iter_shape_and_strides};
+            ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset,
+                                                reduction_shape_stride};
+
+            wg = max_wg;
+            reductions_per_wi =
+                std::max<size_t>(1, (reduction_nelems + wg - 1) / wg);
+
+            size_t reduction_groups =
+                (reduction_nelems + reductions_per_wi * wg - 1) /
+                (reductions_per_wi * wg);
+            assert(reduction_groups == 1);
+
+            auto globalRange =
+                sycl::range<1>{iter_nelems * reduction_groups * wg};
+            auto localRange = sycl::range<1>{wg};
+
+            if constexpr (can_use_reduce_over_group<ReductionOpT, argTy>::value)
+            {
+                using KernelName = class search_over_group_temps_strided_krn<
+                    argTy, resTy, ReductionOpT, IndexOpT,
+                    InputOutputIterIndexerT, ReductionIndexerT, true, true>;
+                cgh.parallel_for<KernelName>(
+                    sycl::nd_range<1>(globalRange, localRange),
+                    SearchReduction<argTy, resTy, ReductionOpT, IndexOpT,
+                                    InputOutputIterIndexerT, ReductionIndexerT,
+                                    true, true>(
+                        arg_tp, nullptr, nullptr, res_tp, ReductionOpT(),
+                        identity_val, IndexOpT(), idx_identity_val,
+                        in_out_iter_indexer, reduction_indexer,
+                        reduction_nelems, iter_nelems, reductions_per_wi));
+            }
+            else {
+                using SlmT = sycl::local_accessor<argTy, 1>;
+                SlmT local_memory = SlmT(localRange, cgh);
+                using KernelName =
+                    class custom_search_over_group_temps_strided_krn<
+                        argTy, resTy, ReductionOpT, IndexOpT,
+                        InputOutputIterIndexerT, ReductionIndexerT, SlmT, true,
+                        true>;
+                cgh.parallel_for<KernelName>(
+                    sycl::nd_range<1>(globalRange, localRange),
+                    CustomSearchReduction<argTy, resTy, ReductionOpT, IndexOpT,
+                                          InputOutputIterIndexerT,
+                                          ReductionIndexerT, SlmT, true, true>(
+                        arg_tp, nullptr, nullptr, res_tp, ReductionOpT(),
+                        identity_val, IndexOpT(), idx_identity_val,
+                        in_out_iter_indexer, reduction_indexer, local_memory,
+                        reduction_nelems, iter_nelems, reductions_per_wi));
+            }
+        });
+        return comp_ev;
+    }
+    else {
+        // more than one work-group is needed, requires a temporary
+        size_t reduction_groups =
+            (reduction_nelems + preferrered_reductions_per_wi * wg - 1) /
+            (preferrered_reductions_per_wi * wg);
+        assert(reduction_groups > 1);
+
+        size_t second_iter_reduction_groups_ =
+            (reduction_groups + preferrered_reductions_per_wi * wg - 1) /
+            (preferrered_reductions_per_wi * wg);
+
+        resTy *partially_reduced_tmp = sycl::malloc_device<resTy>(
+            iter_nelems * (reduction_groups + second_iter_reduction_groups_),
+            exec_q);
+        resTy *partially_reduced_tmp2 = nullptr;
+
+        if
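// A hedged illustration of what the two identities above resolve to for
// argmax over float with std::int64_t indices (the concrete values are an
// assumption about su_ns::Identity, not spelled out in this hunk):
//
//     identity_val     == -std::numeric_limits<float>::infinity();
//     // any real value beats it, so the first comparison always updates
//     idx_identity_val == std::numeric_limits<std::int64_t>::max();
//     // reduced with sycl::minimum, so ties resolve to the smallest index
//     // and non-contributing work-items drop out of the result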
(partially_reduced_tmp == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + else { + partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + } + + argTy *partially_reduced_vals_tmp = sycl::malloc_device( + iter_nelems * (reduction_groups + second_iter_reduction_groups_), + exec_q); + argTy *partially_reduced_vals_tmp2 = nullptr; + + if (partially_reduced_vals_tmp == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + else { + partially_reduced_vals_tmp2 = + partially_reduced_vals_tmp + reduction_groups * iter_nelems; + } + + sycl::event first_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + // Only 2*iter_nd entries describing shape and strides of iterated + // dimensions of input array from iter_shape_and_strides are going + // to be accessed by inp_indexer + InputIndexerT inp_indexer(iter_nd, iter_arg_offset, + iter_shape_and_strides); + ResIndexerT noop_tmp_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + noop_tmp_indexer}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class search_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, true, false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, ReductionOpT(), identity_val, + IndexOpT(), idx_identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_search_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, true, + false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, ReductionOpT(), identity_val, + IndexOpT(), idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, reduction_nelems, + iter_nelems, preferrered_reductions_per_wi)); + } + }); + + size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + + argTy *vals_temp_arg = partially_reduced_vals_tmp; + argTy *vals_temp2_arg = partially_reduced_vals_tmp2; + + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferrered_reductions_per_wi * max_wg) { + size_t reduction_groups_ = + (remaining_reduction_nelems + + preferrered_reductions_per_wi * wg - 1) / + (preferrered_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + sycl::event partial_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = + 
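// Sketch of the scratch layout set up above (idx() is a hypothetical
// helper): each allocation holds two back-to-back regions so successive
// passes can ping-pong without re-allocating. Partial indices live in
// partially_reduced_tmp*, the values they correspond to in
// partially_reduced_vals_tmp*:
//
//     // cell for iteration `iter` and work-group `group` of one pass,
//     // matching out_[out_iter_offset * n_reduction_groups + ...] above:
//     size_t idx(size_t iter, size_t n_groups, size_t group)
//     {
//         return iter * n_groups + group;
//     }
//
// Region 2 starts reduction_groups * iter_nelems elements past region 1.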
dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; + ResIndexerT res_iter_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups_ * wg}; + auto localRange = sycl::range<1>{wg}; + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class search_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, false, + false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, remaining_reduction_nelems, + iter_nelems, preferrered_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_search_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, + false, false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); + } + }); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + std::swap(vals_temp_arg, vals_temp2_arg); + dependent_ev = partial_reduction_ev; + } + + // final reduction to res + sycl::event final_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(remaining_reduction_nelems)}; + ResIndexerT res_iter_indexer{iter_nd, iter_res_offset, + /* shape */ iter_shape_and_strides, + /* strides */ iter_shape_and_strides + + 2 * iter_nd}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = + std::max(1, (remaining_reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class search_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, false, true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + 
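// Host-side shape of the loop above, with illustrative sizes (wg = 256,
// max_wg = 2048, preferrered_reductions_per_wi = 4; assumed values, not
// taken from the patch):
//
//     size_t remaining = 100'000'000;                      // reduction_nelems
//     remaining = (remaining + 4 * 256 - 1) / (4 * 256);   // pass 1: 97'657
//     while (remaining > 4 * 2048) {                       // 8192
//         remaining = (remaining + 4 * 256 - 1) / (4 * 256);
//     }                                                    // 97'657 -> 96
//     // a final single-group pass reduces the remaining 96 to 1
//
// Each pass swaps the ping-pong buffers (std::swap(temp_arg, temp2_arg))
// so this pass's output becomes the next pass's input.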
SearchReduction( + vals_temp_arg, nullptr, temp_arg, res_tp, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, remaining_reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_search_over_group_temps_strided_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, false, + true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + vals_temp_arg, nullptr, temp_arg, res_tp, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + reductions_per_wi)); + } + }); + + sycl::event cleanup_host_task_event = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(final_reduction_ev); + sycl::context ctx = exec_q.get_context(); + + cgh.host_task( + [ctx, partially_reduced_tmp, partially_reduced_vals_tmp] { + sycl::free(partially_reduced_tmp, ctx); + sycl::free(partially_reduced_vals_tmp, ctx); + }); + }); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +typedef sycl::event (*search_contig_impl_fn_ptr)( + sycl::queue, + size_t, + size_t, + const char *, + char *, + py::ssize_t, + py::ssize_t, + py::ssize_t, + const std::vector &); + +template +sycl::event search_axis1_over_group_temps_contig_impl( + sycl::queue exec_q, + size_t iter_nelems, // number of reductions (num. of rows in a matrix + // when reducing over rows) + size_t reduction_nelems, // size of each reduction (length of rows, i.e. + // number of columns) + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + constexpr argTy identity_val = su_ns::Identity::value; + constexpr resTy idx_identity_val = su_ns::Identity::value; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + constexpr size_t preferrered_reductions_per_wi = 8; + // max_max_wg prevents running out of resources on CPU + size_t max_wg = std::min( + size_t(2048), d.get_info()); + + size_t reductions_per_wi(preferrered_reductions_per_wi); + if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { + // reduction only requries 1 work-group, can output directly to res + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputIterIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{0, static_cast(iter_nelems), + static_cast(reduction_nelems)}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = 
+ sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class single_search_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, true, true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + arg_tp, nullptr, nullptr, res_tp, ReductionOpT(), + identity_val, IndexOpT(), idx_identity_val, + in_out_iter_indexer, reduction_indexer, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class single_custom_search_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, true, + true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + arg_tp, nullptr, nullptr, res_tp, ReductionOpT(), + identity_val, IndexOpT(), idx_identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + }); + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + size_t reduction_groups = + (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / + (preferrered_reductions_per_wi * wg); + assert(reduction_groups > 1); + + size_t second_iter_reduction_groups_ = + (reduction_groups + preferrered_reductions_per_wi * wg - 1) / + (preferrered_reductions_per_wi * wg); + + resTy *partially_reduced_tmp = sycl::malloc_device( + iter_nelems * (reduction_groups + second_iter_reduction_groups_), + exec_q); + resTy *partially_reduced_tmp2 = nullptr; + + if (partially_reduced_tmp == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + else { + partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + } + + argTy *partially_reduced_vals_tmp = sycl::malloc_device( + iter_nelems * (reduction_groups + second_iter_reduction_groups_), + exec_q); + argTy *partially_reduced_vals_tmp2 = nullptr; + + if (partially_reduced_vals_tmp == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + else { + partially_reduced_vals_tmp2 = + partially_reduced_vals_tmp + reduction_groups * iter_nelems; + } + + sycl::event first_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputIterIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{0, static_cast(iter_nelems), + static_cast(reduction_nelems)}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class first_search_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, true, false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, ReductionOpT(), identity_val, + IndexOpT(), idx_identity_val, 
in_out_iter_indexer, + reduction_indexer, reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class first_custom_search_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, true, + false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, ReductionOpT(), identity_val, + IndexOpT(), idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, reduction_nelems, + iter_nelems, preferrered_reductions_per_wi)); + } + }); + + size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + + argTy *vals_temp_arg = partially_reduced_vals_tmp; + argTy *vals_temp2_arg = partially_reduced_vals_tmp2; + + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferrered_reductions_per_wi * max_wg) { + size_t reduction_groups_ = + (remaining_reduction_nelems + + preferrered_reductions_per_wi * wg - 1) / + (preferrered_reductions_per_wi * wg); + assert(reduction_groups_ > 1); - // work-items operates over input with indices - // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg - // + reduction_lid - // for 0 <= m < reductions_per_wi + // keep reducing + sycl::event partial_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(dependent_ev); - auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); - const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); - const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + using InputIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; - argT local_red_val(identity_); - outT local_idx(idx_identity_); - size_t arg_reduce_gid0 = - reduction_lid + reduction_batch_id * wg * reductions_per_wi; - for (size_t m = 0; m < reductions_per_wi; ++m) { - size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; + ResIndexerT res_iter_indexer{}; - if (arg_reduce_gid < reduction_max_gid_) { - auto inp_reduction_offset = - inp_reduced_dims_indexer_(arg_reduce_gid); - auto inp_offset = inp_iter_offset + inp_reduction_offset; + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; - argT val = inp_[inp_offset]; - if (val == local_red_val) { - if constexpr (!First) { - local_idx = - idx_reduction_op_(local_idx, inds_[inp_offset]); - } - else { - local_idx = idx_reduction_op_( - local_idx, static_cast(arg_reduce_gid)); - } + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups_ * wg}; + auto localRange = sycl::range<1>{wg}; + if constexpr (can_use_reduce_over_group::value) { + using KernelName = + class middle_search_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, false, + false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + vals_temp_arg, 
vals_temp2_arg, temp_arg, temp2_arg, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, remaining_reduction_nelems, + iter_nelems, preferrered_reductions_per_wi)); } else { - if constexpr (su_ns::IsMinimum::value) { - using dpctl::tensor::type_utils::is_complex; - if constexpr (is_complex::value) { - using dpctl::tensor::math_utils::less_complex; - // less_complex always returns false for NaNs, so - // check - if (less_complex(val, local_red_val) || - std::isnan(std::real(val)) || - std::isnan(std::imag(val))) - { - local_red_val = val; - if constexpr (!First) { - local_idx = inds_[inp_offset]; - } - else { - local_idx = - static_cast(arg_reduce_gid); - } - } - } - else if constexpr (std::is_floating_point_v) { - if (val < local_red_val || std::isnan(val)) { - local_red_val = val; - if constexpr (!First) { - local_idx = inds_[inp_offset]; - } - else { - local_idx = - static_cast(arg_reduce_gid); - } - } - } - else { - if (val < local_red_val) { - local_red_val = val; - if constexpr (!First) { - local_idx = inds_[inp_offset]; - } - else { - local_idx = - static_cast(arg_reduce_gid); - } - } - } - } - else if constexpr (su_ns::IsMaximum::value) { - using dpctl::tensor::type_utils::is_complex; - if constexpr (is_complex::value) { - using dpctl::tensor::math_utils::greater_complex; - if (greater_complex(val, local_red_val) || - std::isnan(std::real(val)) || - std::isnan(std::imag(val))) - { - local_red_val = val; - if constexpr (!First) { - local_idx = inds_[inp_offset]; - } - else { - local_idx = - static_cast(arg_reduce_gid); - } - } - } - else if constexpr (std::is_floating_point_v) { - if (val > local_red_val || std::isnan(val)) { - local_red_val = val; - if constexpr (!First) { - local_idx = inds_[inp_offset]; - } - else { - local_idx = - static_cast(arg_reduce_gid); - } - } - } - else { - if (val > local_red_val) { - local_red_val = val; - if constexpr (!First) { - local_idx = inds_[inp_offset]; - } - else { - local_idx = - static_cast(arg_reduce_gid); - } - } - } - } + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class middle_custom_search_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, + false, false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); } - } + }); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + std::swap(vals_temp_arg, vals_temp2_arg); + dependent_ev = partial_reduction_ev; } - auto work_group = it.get_group(); - // This only works if reduction_op_ is from small set of operators - argT red_val_over_wg = su_ns::custom_reduce_over_group( - work_group, local_mem_, local_red_val, reduction_op_); + // final reduction to res + sycl::event final_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_ev); - using dpctl::tensor::type_utils::is_complex; - if constexpr (is_complex::value) { - // equality does not hold for NaNs, so check here - local_idx = (red_val_over_wg == local_red_val || - std::isnan(std::real(local_red_val)) || - std::isnan(std::imag(local_red_val))) - ? 
local_idx - : idx_identity_; - } - else if constexpr (std::is_floating_point_v) { - // equality does not hold for NaNs, so check here - local_idx = - (red_val_over_wg == local_red_val || std::isnan(local_red_val)) - ? local_idx - : idx_identity_; - } - else { - local_idx = - red_val_over_wg == local_red_val ? local_idx : idx_identity_; - } - outT idx_over_wg = sycl::reduce_over_group( - work_group, local_idx, idx_identity_, idx_reduction_op_); - if (work_group.leader()) { - // each group writes to a different memory location - if constexpr (!Last) { - // if not the final reduction, write value corresponding to - // an index to a temporary - vals_[out_iter_offset * n_reduction_groups + - reduction_batch_id] = red_val_over_wg; + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(remaining_reduction_nelems)}; + ResIndexerT res_iter_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = + std::max(1, (remaining_reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class final_search_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, false, true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + vals_temp_arg, nullptr, temp_arg, res_tp, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, remaining_reduction_nelems, + iter_nelems, reductions_per_wi)); } - out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = - idx_over_wg; - } - } -}; + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class final_custom_search_axis1_temps_contig_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, false, + true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + vals_temp_arg, nullptr, temp_arg, res_tp, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + reductions_per_wi)); + } + }); -typedef sycl::event (*search_reduction_strided_impl_fn_ptr)( - sycl::queue, - size_t, - size_t, - const char *, - char *, - int, - const py::ssize_t *, - py::ssize_t, - py::ssize_t, - int, - const py::ssize_t *, - py::ssize_t, - const std::vector &); + sycl::event cleanup_host_task_event = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(final_reduction_ev); + sycl::context ctx = exec_q.get_context(); -template -class search_reduction_over_group_temps_krn; + cgh.host_task( + [ctx, partially_reduced_tmp, partially_reduced_vals_tmp] { + sycl::free(partially_reduced_tmp, ctx); + sycl::free(partially_reduced_vals_tmp, ctx); 
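// The cleanup idiom used here, reduced to its essentials (a sketch;
// free_after() and scratch are hypothetical names):
//
//     sycl::event free_after(sycl::queue q, void *scratch, sycl::event dep)
//     {
//         return q.submit([&](sycl::handler &cgh) {
//             cgh.depends_on(dep); // outlive the kernels reading scratch
//             const sycl::context ctx = q.get_context();
//             cgh.host_task([ctx, scratch] { sycl::free(scratch, ctx); });
//         });
//     }
//
// The returned event is a host-task event, hence the FIXME just below
// about not handing it back to callers directly.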
+ }); + }); -template -class search_custom_reduction_over_group_temps_krn; + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list -using dpctl::tensor::sycl_utils::choose_workgroup_size; + return cleanup_host_task_event; + } +} template -sycl::event search_reduction_over_group_temps_strided_impl( +sycl::event search_axis0_over_group_temps_contig_impl( sycl::queue exec_q, size_t iter_nelems, // number of reductions (num. of rows in a matrix // when reducing over rows) @@ -2525,12 +4630,8 @@ sycl::event search_reduction_over_group_temps_strided_impl( // number of columns) const char *arg_cp, char *res_cp, - int iter_nd, - const py::ssize_t *iter_shape_and_strides, py::ssize_t iter_arg_offset, py::ssize_t iter_res_offset, - int red_nd, - const py::ssize_t *reduction_shape_stride, py::ssize_t reduction_arg_offset, const std::vector &depends) { @@ -2544,7 +4645,7 @@ sycl::event search_reduction_over_group_temps_strided_impl( const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr size_t preferrered_reductions_per_wi = 4; + constexpr size_t preferrered_reductions_per_wi = 8; // max_max_wg prevents running out of resources on CPU size_t max_wg = std::min( size_t(2048), d.get_info()); @@ -2555,16 +4656,20 @@ sycl::event search_reduction_over_group_temps_strided_impl( sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; - using ReductionIndexerT = - dpctl::tensor::offset_utils::StridedIndexer; + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; - InputOutputIterIndexerT in_out_iter_indexer{ - iter_nd, iter_arg_offset, iter_res_offset, - iter_shape_and_strides}; - ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, - reduction_shape_stride}; + NoOpIndexerT columns_indexer{}; + NoOpIndexerT result_indexer{}; + InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + ReductionIndexerT reduction_indexer{ + 0, /* size */ static_cast(reduction_nelems), + /* step */ static_cast(iter_nelems)}; wg = max_wg; reductions_per_wi = @@ -2581,7 +4686,7 @@ sycl::event search_reduction_over_group_temps_strided_impl( if constexpr (can_use_reduce_over_group::value) { - using KernelName = class search_reduction_over_group_temps_krn< + using KernelName = class single_search_axis0_temps_contig_krn< argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT, ReductionIndexerT, true, true>; cgh.parallel_for( @@ -2598,7 +4703,7 @@ sycl::event search_reduction_over_group_temps_strided_impl( using SlmT = sycl::local_accessor; SlmT local_memory = SlmT(localRange, cgh); using KernelName = - class search_custom_reduction_over_group_temps_krn< + class single_custom_search_axis0_temps_contig_krn< argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT, ReductionIndexerT, SlmT, true, true>; @@ -2655,25 +4760,20 @@ sycl::event search_reduction_over_group_temps_strided_impl( sycl::event first_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer; - using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using NoOpIndexerT = 
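// For the axis-0 contiguous case set up here, element k of reduction i
// sits at a fixed column of a C-contiguous (reduction_nelems, iter_nelems)
// matrix; assuming Strided1DIndexer{offset, size, step} maps k to
// offset + k * step, the effective address is:
//
//     size_t axis0_elem(size_t i, size_t k, size_t iter_nelems)
//     {
//         return i + k * iter_nelems; // column i, row k
//     }
//
// which replaces the general StridedIndexer's shape/stride walk with plain
// arithmetic for contiguous inputs.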
dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; using InputOutputIterIndexerT = dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< - InputIndexerT, ResIndexerT>; - using ReductionIndexerT = - dpctl::tensor::offset_utils::StridedIndexer; - - // Only 2*iter_nd entries describing shape and strides of iterated - // dimensions of input array from iter_shape_and_strides are going - // to be accessed by inp_indexer - InputIndexerT inp_indexer(iter_nd, iter_arg_offset, - iter_shape_and_strides); - ResIndexerT noop_tmp_indexer{}; + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; - InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, - noop_tmp_indexer}; - ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, - reduction_shape_stride}; + NoOpIndexerT columns_indexer{}; + NoOpIndexerT result_indexer{}; + InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + ReductionIndexerT reduction_indexer{ + 0, /* size */ static_cast(reduction_nelems), + /* step */ static_cast(iter_nelems)}; auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg}; @@ -2681,7 +4781,7 @@ sycl::event search_reduction_over_group_temps_strided_impl( if constexpr (can_use_reduce_over_group::value) { - using KernelName = class search_reduction_over_group_temps_krn< + using KernelName = class first_search_axis0_temps_contig_krn< argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT, ReductionIndexerT, true, false>; cgh.parallel_for( @@ -2699,7 +4799,7 @@ sycl::event search_reduction_over_group_temps_strided_impl( using SlmT = sycl::local_accessor; SlmT local_memory = SlmT(localRange, cgh); using KernelName = - class search_custom_reduction_over_group_temps_krn< + class first_custom_search_axis0_temps_contig_krn< argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT, ReductionIndexerT, SlmT, true, false>; @@ -2763,7 +4863,7 @@ sycl::event search_reduction_over_group_temps_strided_impl( if constexpr (can_use_reduce_over_group::value) { using KernelName = - class search_reduction_over_group_temps_krn< + class middle_search_axis0_temps_contig_krn< argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT, ReductionIndexerT, false, false>; @@ -2782,7 +4882,7 @@ sycl::event search_reduction_over_group_temps_strided_impl( using SlmT = sycl::local_accessor; SlmT local_memory = SlmT(localRange, cgh); using KernelName = - class search_custom_reduction_over_group_temps_krn< + class middle_custom_search_axis0_temps_contig_krn< argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT, ReductionIndexerT, SlmT, false, false>; @@ -2812,8 +4912,7 @@ sycl::event search_reduction_over_group_temps_strided_impl( cgh.depends_on(dependent_ev); using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; - using ResIndexerT = - dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; using InputOutputIterIndexerT = dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< InputIndexerT, ResIndexerT>; @@ -2822,10 +4921,7 @@ sycl::event search_reduction_over_group_temps_strided_impl( InputIndexerT inp_indexer{ 0, static_cast(iter_nelems), static_cast(remaining_reduction_nelems)}; - ResIndexerT res_iter_indexer{iter_nd, iter_res_offset, - /* shape */ iter_shape_and_strides, - /*s trides */ iter_shape_and_strides + - 2 * iter_nd}; + ResIndexerT res_iter_indexer{}; InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, 
res_iter_indexer}; @@ -2846,7 +4942,7 @@ sycl::event search_reduction_over_group_temps_strided_impl( if constexpr (can_use_reduce_over_group::value) { - using KernelName = class search_reduction_over_group_temps_krn< + using KernelName = class final_search_axis0_temps_contig_krn< argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT, ReductionIndexerT, false, true>; cgh.parallel_for( @@ -2864,7 +4960,7 @@ sycl::event search_reduction_over_group_temps_strided_impl( using SlmT = sycl::local_accessor; SlmT local_memory = SlmT(localRange, cgh); using KernelName = - class search_custom_reduction_over_group_temps_krn< + class final_custom_search_axis0_temps_contig_krn< argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT, ReductionIndexerT, SlmT, false, true>; @@ -2971,7 +5067,75 @@ struct ArgmaxOverAxisTempsStridedFactory // op for indices using IndexOpT = sycl::minimum; return dpctl::tensor::kernels:: - search_reduction_over_group_temps_strided_impl< + search_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct ArgmaxOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSearchReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis1_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis1_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct ArgmaxOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSearchReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis0_over_group_temps_contig_impl< srcTy, dstTy, ReductionOpT, IndexOpT>; } else { @@ -2980,7 +5144,7 @@ struct ArgmaxOverAxisTempsStridedFactory // op for indices using IndexOpT = sycl::minimum; return dpctl::tensor::kernels:: - search_reduction_over_group_temps_strided_impl< + search_axis0_over_group_temps_contig_impl< srcTy, dstTy, ReductionOpT, IndexOpT>; } } @@ -3005,7 +5169,75 @@ struct ArgminOverAxisTempsStridedFactory // op for indices using IndexOpT = sycl::minimum; return dpctl::tensor::kernels:: - search_reduction_over_group_temps_strided_impl< + search_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct ArgminOverAxis1TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSearchReductionTemps< 
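// All Argmax/Argmin factories in this hunk share one shape; a condensed,
// hypothetical ExampleFactory (it omits the is_defined -> nullptr guard
// that the real factories keep):
//
//     template <typename fnT, typename srcTy, typename dstTy>
//     struct ExampleFactory
//     {
//         fnT get() const
//         {
//             if constexpr (std::is_integral_v<srcTy> &&
//                           !std::is_same_v<srcTy, bool>)
//                 // integers: plain sycl operators suffice
//                 return search_axis1_over_group_temps_contig_impl<
//                     srcTy, dstTy, sycl::maximum<srcTy>,
//                     sycl::minimum<dstTy>>;
//             else
//                 // FP and complex need the NaN-aware su_ns operators
//                 return search_axis1_over_group_temps_contig_impl<
//                     srcTy, dstTy, su_ns::Maximum<srcTy>,
//                     sycl::minimum<dstTy>>;
//         }
//     };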
+ srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis1_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis1_over_group_temps_contig_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct ArgminOverAxis0TempsContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSearchReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_axis0_over_group_temps_contig_impl< srcTy, dstTy, ReductionOpT, IndexOpT>; } else { @@ -3014,7 +5246,7 @@ struct ArgminOverAxisTempsStridedFactory // op for indices using IndexOpT = sycl::minimum; return dpctl::tensor::kernels:: - search_reduction_over_group_temps_strided_impl< + search_axis0_over_group_temps_contig_impl< srcTy, dstTy, ReductionOpT, IndexOpT>; } } diff --git a/dpctl/tensor/libtensor/include/utils/math_utils.hpp b/dpctl/tensor/libtensor/include/utils/math_utils.hpp index d724e03e35..120a14d536 100644 --- a/dpctl/tensor/libtensor/include/utils/math_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/math_utils.hpp @@ -115,6 +115,26 @@ template T min_complex(const T &x1, const T &x2) return (std::isnan(real1) || isnan_imag1 || lt) ? x1 : x2; } +template T logaddexp(T x, T y) +{ + if (x == y) { // handle signed infinities + const T log2 = std::log(T(2)); + return x + log2; + } + else { + const T tmp = x - y; + if (tmp > 0) { + return x + std::log1p(std::exp(-tmp)); + } + else if (tmp <= 0) { + return y + std::log1p(std::exp(tmp)); + } + else { + return std::numeric_limits::quiet_NaN(); + } + } +} + } // namespace math_utils } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp index 0d4240c516..c0165b0ecc 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp @@ -286,6 +286,46 @@ struct GetIdentity::value>> static constexpr T value = static_cast(1); }; +// LogSumExp + +template struct LogSumExp +{ + T operator()(const T &x, const T &y) const + { + using dpctl::tensor::math_utils::logaddexp; + return logaddexp(x, y); + } +}; + +template +using IsLogSumExp = std::bool_constant>>; + +// only defined for types with infinity +template +struct GetIdentity::value>> +{ + static constexpr T value = -std::numeric_limits::infinity(); +}; + +// Hypot + +template struct Hypot +{ + T operator()(const T &x, const T &y) const + { + return sycl::hypot(x, y); + } +}; + +template +using IsHypot = std::bool_constant>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = 0; +}; + // Identity template struct Identity diff --git a/dpctl/tensor/libtensor/source/reduction_over_axis.cpp b/dpctl/tensor/libtensor/source/reduction_over_axis.cpp deleted file mode 100644 index c67fcd5ba3..0000000000 --- a/dpctl/tensor/libtensor/source/reduction_over_axis.cpp +++ /dev/null @@ -1,514 +0,0 @@ -//===-- ------------ Implementation of 
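// A worked note on the logaddexp added above (the numbers are
// illustrative): it evaluates log(exp(x) + exp(y)) as
// max(x, y) + log1p(exp(-|x - y|)), so no exp() of a large argument is
// ever formed. For x = 1000.0, y = 999.0, exp(1000.0) overflows double,
// yet
//
//     1000.0 + std::log1p(std::exp(-1.0)); // ~= 1000.3132616875
//
// The x == y branch yields logaddexp(x, x) = x + log(2) and returns +/-inf
// for equal infinite arguments instead of the NaN that inf - inf would
// produce; the trailing else returns quiet NaN when tmp is NaN (NaN
// operands). Since logaddexp(-inf, y) = y, -inf is the neutral element,
// matching the GetIdentity specialization added for LogSumExp; likewise
// hypot(0, y) = |y| motivates 0 as the Hypot identity.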
_tensor_impl module ----*-C++-*-/===// -// -// Data Parallel Control (dpctl) -// -// Copyright 2020-2023 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -//===--------------------------------------------------------------------===// -/// -/// \file -/// This file defines functions of dpctl.tensor._tensor_impl extensions -//===--------------------------------------------------------------------===// - -#include -#include -#include -#include - -#include -#include -#include - -#include "dpctl4pybind11.hpp" -#include "kernels/reductions.hpp" -#include "reduction_over_axis.hpp" -#include "simplify_iteration_space.hpp" -#include "utils/type_dispatch.hpp" - -namespace dpctl -{ -namespace tensor -{ -namespace py_internal -{ - -namespace td_ns = dpctl::tensor::type_dispatch; -// Max -namespace impl -{ - -using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; -static reduction_strided_impl_fn_ptr - max_over_axis_strided_atomic_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static reduction_strided_impl_fn_ptr - max_over_axis_strided_temps_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; -static reduction_contig_impl_fn_ptr - max_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static reduction_contig_impl_fn_ptr - max_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -void populate_max_over_axis_dispatch_tables(void) -{ - using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; - using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; - using td_ns::DispatchTableBuilder; - - using dpctl::tensor::kernels::MaxOverAxisAtomicStridedFactory; - DispatchTableBuilder - dtb1; - dtb1.populate_dispatch_table(max_over_axis_strided_atomic_dispatch_table); - - using dpctl::tensor::kernels::MaxOverAxisTempsStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(max_over_axis_strided_temps_dispatch_table); - - using dpctl::tensor::kernels::MaxOverAxis1AtomicContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(max_over_axis1_contig_atomic_dispatch_table); - - using dpctl::tensor::kernels::MaxOverAxis0AtomicContigFactory; - DispatchTableBuilder - dtb4; - dtb4.populate_dispatch_table(max_over_axis0_contig_atomic_dispatch_table); -} - -} // namespace impl - -// Min -namespace impl -{ - -using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; -static reduction_strided_impl_fn_ptr - min_over_axis_strided_atomic_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static reduction_strided_impl_fn_ptr - min_over_axis_strided_temps_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; -static reduction_contig_impl_fn_ptr - min_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static reduction_contig_impl_fn_ptr - min_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] - [td_ns::num_types]; 
-
-void populate_min_over_axis_dispatch_tables(void)
-{
-    using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
-    using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
-    using td_ns::DispatchTableBuilder;
-
-    using dpctl::tensor::kernels::MinOverAxisAtomicStridedFactory;
-    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
-                         MinOverAxisAtomicStridedFactory, td_ns::num_types>
-        dtb1;
-    dtb1.populate_dispatch_table(min_over_axis_strided_atomic_dispatch_table);
-
-    using dpctl::tensor::kernels::MinOverAxisTempsStridedFactory;
-    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
-                         MinOverAxisTempsStridedFactory, td_ns::num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(min_over_axis_strided_temps_dispatch_table);
-
-    using dpctl::tensor::kernels::MinOverAxis1AtomicContigFactory;
-    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
-                         MinOverAxis1AtomicContigFactory, td_ns::num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(min_over_axis1_contig_atomic_dispatch_table);
-
-    using dpctl::tensor::kernels::MinOverAxis0AtomicContigFactory;
-    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
-                         MinOverAxis0AtomicContigFactory, td_ns::num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(min_over_axis0_contig_atomic_dispatch_table);
-}
-
-} // namespace impl
-
-// Sum
-namespace impl
-{
-
-using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
-static reduction_strided_impl_fn_ptr
-    sum_over_axis_strided_atomic_dispatch_table[td_ns::num_types]
-                                               [td_ns::num_types];
-static reduction_strided_impl_fn_ptr
-    sum_over_axis_strided_temps_dispatch_table[td_ns::num_types]
-                                              [td_ns::num_types];
-
-using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
-static reduction_contig_impl_fn_ptr
-    sum_over_axis1_contig_atomic_dispatch_table[td_ns::num_types]
-                                               [td_ns::num_types];
-static reduction_contig_impl_fn_ptr
-    sum_over_axis0_contig_atomic_dispatch_table[td_ns::num_types]
-                                               [td_ns::num_types];
-
-void populate_sum_over_axis_dispatch_tables(void)
-{
-    using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
-    using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
-    using namespace td_ns;
-
-    using dpctl::tensor::kernels::SumOverAxisAtomicStridedFactory;
-    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
-                         SumOverAxisAtomicStridedFactory, td_ns::num_types>
-        dtb1;
-    dtb1.populate_dispatch_table(sum_over_axis_strided_atomic_dispatch_table);
-
-    using dpctl::tensor::kernels::SumOverAxisTempsStridedFactory;
-    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
-                         SumOverAxisTempsStridedFactory, td_ns::num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(sum_over_axis_strided_temps_dispatch_table);
-
-    using dpctl::tensor::kernels::SumOverAxis1AtomicContigFactory;
-    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
-                         SumOverAxis1AtomicContigFactory, td_ns::num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(sum_over_axis1_contig_atomic_dispatch_table);
-
-    using dpctl::tensor::kernels::SumOverAxis0AtomicContigFactory;
-    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
-                         SumOverAxis0AtomicContigFactory, td_ns::num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(sum_over_axis0_contig_atomic_dispatch_table);
-}
-
-} // namespace impl
-
-// Product
-namespace impl
-{
-
-using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
-static reduction_strided_impl_fn_ptr
-    prod_over_axis_strided_atomic_dispatch_table[td_ns::num_types]
-                                                [td_ns::num_types];
-static reduction_strided_impl_fn_ptr
-    prod_over_axis_strided_temps_dispatch_table[td_ns::num_types]
-                                               [td_ns::num_types];
-
-using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
-static reduction_contig_impl_fn_ptr
-    prod_over_axis1_contig_atomic_dispatch_table[td_ns::num_types]
-                                                [td_ns::num_types];
-static reduction_contig_impl_fn_ptr
-    prod_over_axis0_contig_atomic_dispatch_table[td_ns::num_types]
-                                                [td_ns::num_types];
-
-void populate_prod_over_axis_dispatch_tables(void)
-{
-    using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
-    using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
-    using namespace td_ns;
-
-    using dpctl::tensor::kernels::ProductOverAxisAtomicStridedFactory;
-    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
-                         ProductOverAxisAtomicStridedFactory, td_ns::num_types>
-        dtb1;
-    dtb1.populate_dispatch_table(prod_over_axis_strided_atomic_dispatch_table);
-
-    using dpctl::tensor::kernels::ProductOverAxisTempsStridedFactory;
-    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
-                         ProductOverAxisTempsStridedFactory, td_ns::num_types>
-        dtb2;
-    dtb2.populate_dispatch_table(prod_over_axis_strided_temps_dispatch_table);
-
-    using dpctl::tensor::kernels::ProductOverAxis1AtomicContigFactory;
-    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
-                         ProductOverAxis1AtomicContigFactory, td_ns::num_types>
-        dtb3;
-    dtb3.populate_dispatch_table(prod_over_axis1_contig_atomic_dispatch_table);
-
-    using dpctl::tensor::kernels::ProductOverAxis0AtomicContigFactory;
-    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
-                         ProductOverAxis0AtomicContigFactory, td_ns::num_types>
-        dtb4;
-    dtb4.populate_dispatch_table(prod_over_axis0_contig_atomic_dispatch_table);
-}
-
-} // namespace impl
-
-// Argmax
-namespace impl
-{
-
-using dpctl::tensor::kernels::search_reduction_strided_impl_fn_ptr;
-static search_reduction_strided_impl_fn_ptr
-    argmax_over_axis_strided_temps_dispatch_table[td_ns::num_types]
-                                                 [td_ns::num_types];
-
-void populate_argmax_over_axis_dispatch_tables(void)
-{
-    using dpctl::tensor::kernels::search_reduction_strided_impl_fn_ptr;
-    using td_ns::DispatchTableBuilder;
-
-    using dpctl::tensor::kernels::ArgmaxOverAxisTempsStridedFactory;
-    DispatchTableBuilder<search_reduction_strided_impl_fn_ptr,
-                         ArgmaxOverAxisTempsStridedFactory, td_ns::num_types>
-        dtb1;
-    dtb1.populate_dispatch_table(argmax_over_axis_strided_temps_dispatch_table);
-}
-
-} // namespace impl
-
-// Argmin
-namespace impl
-{
-
-using dpctl::tensor::kernels::search_reduction_strided_impl_fn_ptr;
-static search_reduction_strided_impl_fn_ptr
-    argmin_over_axis_strided_temps_dispatch_table[td_ns::num_types]
-                                                 [td_ns::num_types];
-
-void populate_argmin_over_axis_dispatch_tables(void)
-{
-    using dpctl::tensor::kernels::search_reduction_strided_impl_fn_ptr;
-    using td_ns::DispatchTableBuilder;
-
-    using dpctl::tensor::kernels::ArgminOverAxisTempsStridedFactory;
-    DispatchTableBuilder<search_reduction_strided_impl_fn_ptr,
-                         ArgminOverAxisTempsStridedFactory, td_ns::num_types>
-        dtb1;
-    dtb1.populate_dispatch_table(argmin_over_axis_strided_temps_dispatch_table);
-}
-
-} // namespace impl
-
-namespace py = pybind11;
-
-void init_reduction_functions(py::module_ m)
-{
-    using arrayT = dpctl::tensor::usm_ndarray;
-    using event_vecT = std::vector<sycl::event>;
-
-    namespace impl = dpctl::tensor::py_internal::impl;
-
-    using dpctl::tensor::py_internal::py_reduction_dtype_supported;
-    using dpctl::tensor::py_internal::py_reduction_over_axis;
-
-    using dpctl::tensor::py_internal::check_atomic_support;
-    using dpctl::tensor::py_internal::fixed_decision;
-
-    // MAX
-    {
-        using dpctl::tensor::py_internal::impl::
-            populate_max_over_axis_dispatch_tables;
-        populate_max_over_axis_dispatch_tables();
-        using impl::max_over_axis0_contig_atomic_dispatch_table;
-        using impl::max_over_axis1_contig_atomic_dispatch_table;
-        using impl::max_over_axis_strided_atomic_dispatch_table;
-        using impl::max_over_axis_strided_temps_dispatch_table;
-
-        const auto &check_atomic_support_size4 =
-            check_atomic_support</*require_atomic64*/ false>;
-        const auto &check_atomic_support_size8 =
-            check_atomic_support</*require_atomic64*/ true>;
-
-        auto max_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
-                             const arrayT &dst, sycl::queue &exec_q,
-                             const event_vecT &depends = {}) {
-            return py_reduction_over_axis(
-                src, trailing_dims_to_reduce, dst, exec_q, depends,
-                max_over_axis_strided_atomic_dispatch_table,
-                max_over_axis_strided_temps_dispatch_table,
-                max_over_axis0_contig_atomic_dispatch_table,
-                max_over_axis1_contig_atomic_dispatch_table,
-                check_atomic_support_size4, check_atomic_support_size8);
-        };
-        m.def("_max_over_axis", max_pyapi, "", py::arg("src"),
-              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-    }
-
-    // MIN
-    {
-        using dpctl::tensor::py_internal::impl::
-            populate_min_over_axis_dispatch_tables;
-        populate_min_over_axis_dispatch_tables();
-        using impl::min_over_axis0_contig_atomic_dispatch_table;
-        using impl::min_over_axis1_contig_atomic_dispatch_table;
-        using impl::min_over_axis_strided_atomic_dispatch_table;
-        using impl::min_over_axis_strided_temps_dispatch_table;
-
-        const auto &check_atomic_support_size4 =
-            check_atomic_support</*require_atomic64*/ false>;
-        const auto &check_atomic_support_size8 =
-            check_atomic_support</*require_atomic64*/ true>;
-
-        auto min_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
-                             const arrayT &dst, sycl::queue &exec_q,
-                             const event_vecT &depends = {}) {
-            return py_reduction_over_axis(
-                src, trailing_dims_to_reduce, dst, exec_q, depends,
-                min_over_axis_strided_atomic_dispatch_table,
-                min_over_axis_strided_temps_dispatch_table,
-                min_over_axis0_contig_atomic_dispatch_table,
-                min_over_axis1_contig_atomic_dispatch_table,
-                check_atomic_support_size4, check_atomic_support_size8);
-        };
-        m.def("_min_over_axis", min_pyapi, "", py::arg("src"),
-              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-    }
-
-    // SUM
-    {
-        using dpctl::tensor::py_internal::impl::
-            populate_sum_over_axis_dispatch_tables;
-        populate_sum_over_axis_dispatch_tables();
-        using impl::sum_over_axis0_contig_atomic_dispatch_table;
-        using impl::sum_over_axis1_contig_atomic_dispatch_table;
-        using impl::sum_over_axis_strided_atomic_dispatch_table;
-        using impl::sum_over_axis_strided_temps_dispatch_table;
-
-        const auto &check_atomic_support_size4 =
-            check_atomic_support</*require_atomic64*/ false>;
-        const auto &check_atomic_support_size8 =
-            check_atomic_support</*require_atomic64*/ true>;
-
-        auto sum_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
-                             const arrayT &dst, sycl::queue &exec_q,
-                             const event_vecT &depends = {}) {
-            return py_reduction_over_axis(
-                src, trailing_dims_to_reduce, dst, exec_q, depends,
-                sum_over_axis_strided_atomic_dispatch_table,
-                sum_over_axis_strided_temps_dispatch_table,
-                sum_over_axis0_contig_atomic_dispatch_table,
-                sum_over_axis1_contig_atomic_dispatch_table,
-                check_atomic_support_size4, check_atomic_support_size8);
-        };
-        m.def("_sum_over_axis", sum_pyapi, "", py::arg("src"),
-              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto sum_dtype_supported =
-            [&](const py::dtype &input_dtype, const py::dtype &output_dtype,
-                const std::string &dst_usm_type, sycl::queue &q) {
-                return py_reduction_dtype_supported(
-                    input_dtype, output_dtype, dst_usm_type, q,
-                    sum_over_axis_strided_atomic_dispatch_table,
-                    sum_over_axis_strided_temps_dispatch_table,
-                    check_atomic_support_size4, check_atomic_support_size8);
-            };
-        m.def("_sum_over_axis_dtype_supported", sum_dtype_supported, "",
-              py::arg("arg_dtype"), py::arg("out_dtype"),
-              py::arg("dst_usm_type"), py::arg("sycl_queue"));
-    }
-
-    // PROD
-    {
-        using dpctl::tensor::py_internal::impl::
-            populate_prod_over_axis_dispatch_tables;
-        populate_prod_over_axis_dispatch_tables();
-        using impl::prod_over_axis0_contig_atomic_dispatch_table;
-        using impl::prod_over_axis1_contig_atomic_dispatch_table;
-        using impl::prod_over_axis_strided_atomic_dispatch_table;
-        using impl::prod_over_axis_strided_temps_dispatch_table;
-
-        const auto &check_atomic_support_size4 =
-            check_atomic_support</*require_atomic64*/ false>;
-        const auto &check_atomic_support_size8 =
-            check_atomic_support</*require_atomic64*/ true>;
-
-        auto prod_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
-                              const arrayT &dst, sycl::queue &exec_q,
-                              const event_vecT &depends = {}) {
-            return py_reduction_over_axis(
-                src, trailing_dims_to_reduce, dst, exec_q, depends,
-                prod_over_axis_strided_atomic_dispatch_table,
-                prod_over_axis_strided_temps_dispatch_table,
-                prod_over_axis0_contig_atomic_dispatch_table,
-                prod_over_axis1_contig_atomic_dispatch_table,
-                check_atomic_support_size4, check_atomic_support_size8);
-        };
-        m.def("_prod_over_axis", prod_pyapi, "", py::arg("src"),
-              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-
-        auto prod_dtype_supported =
-            [&](const py::dtype &input_dtype, const py::dtype &output_dtype,
-                const std::string &dst_usm_type, sycl::queue &q) {
-                return py_reduction_dtype_supported(
-                    input_dtype, output_dtype, dst_usm_type, q,
-                    prod_over_axis_strided_atomic_dispatch_table,
-                    prod_over_axis_strided_temps_dispatch_table,
-                    check_atomic_support_size4, check_atomic_support_size8);
-            };
-        m.def("_prod_over_axis_dtype_supported", prod_dtype_supported, "",
-              py::arg("arg_dtype"), py::arg("out_dtype"),
-              py::arg("dst_usm_type"), py::arg("sycl_queue"));
-    }
-
-    // ARGMAX
-    {
-        using dpctl::tensor::py_internal::impl::
-            populate_argmax_over_axis_dispatch_tables;
-        populate_argmax_over_axis_dispatch_tables();
-        using impl::argmax_over_axis_strided_temps_dispatch_table;
-
-        auto argmax_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
-                                const arrayT &dst, sycl::queue &exec_q,
-                                const event_vecT &depends = {}) {
-            using dpctl::tensor::py_internal::py_search_over_axis;
-            return py_search_over_axis(
-                src, trailing_dims_to_reduce, dst, exec_q, depends,
-                argmax_over_axis_strided_temps_dispatch_table);
-        };
-        m.def("_argmax_over_axis", argmax_pyapi, "", py::arg("src"),
-              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
-    }
-
-    // ARGMIN
-    {
-        using dpctl::tensor::py_internal::impl::
-            populate_argmin_over_axis_dispatch_tables;
-        populate_argmin_over_axis_dispatch_tables();
-        using impl::argmin_over_axis_strided_temps_dispatch_table;
-
-        auto argmin_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
-                                const arrayT &dst, sycl::queue &exec_q,
-                                const event_vecT &depends = {}) {
-            using dpctl::tensor::py_internal::py_search_over_axis;
-            return py_search_over_axis(
-                src, trailing_dims_to_reduce, dst, exec_q, depends,
-                argmin_over_axis_strided_temps_dispatch_table);
-        };
-        m.def("_argmin_over_axis", argmin_pyapi, "", py::arg("src"),
-              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
-              py::arg("sycl_queue"), py::arg("depends") = py::list());
    }
-}
-
-} // namespace py_internal
-} // namespace tensor
-} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp
deleted file mode 100644
index 1a9cb6f5e7..0000000000
--- a/dpctl/tensor/libtensor/source/reduction_over_axis.hpp
+++ /dev/null
@@ -1,689 +0,0 @@
-//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
-//
-// Data Parallel Control (dpctl)
-//
-// Copyright 2020-2023 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
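[Note on the deleted header below: it implements the dispatch policy shared by sum/prod/max/min, namely prefer the atomic kernel when the destination's type and USM allocation support it, and fall back to the temporaries-based kernel otherwise. A minimal sketch of that selection, assuming the two-dimensional tables and lookup ids used throughout this patch; the helper name pick_reduction_fn is hypothetical:

    // Sketch only: fnT stands for the kernel function-pointer types
    // (reduction_strided_impl_fn_ptr, etc.); N is td_ns::num_types.
    template <typename fnT, int N>
    fnT pick_reduction_fn(bool supports_atomics,
                          const fnT (&atomic_table)[N][N],
                          const fnT (&temps_table)[N][N],
                          int src_typeid, int dst_typeid)
    {
        fnT fn = nullptr;
        if (supports_atomics) {
            fn = atomic_table[src_typeid][dst_typeid];
        }
        if (fn == nullptr) {
            // fall back to the slower implementation using temporaries
            fn = temps_table[src_typeid][dst_typeid];
        }
        return fn; // nullptr means the dtype pair is unsupported
    }
]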
-// See the License for the specific language governing permissions and -// limitations under the License. -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file defines functions of dpctl.tensor._tensor_impl extensions, -/// specifically functions for reductions. -//===----------------------------------------------------------------------===// - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "dpctl4pybind11.hpp" -#include -#include -#include - -#include "kernels/reductions.hpp" -#include "simplify_iteration_space.hpp" -#include "utils/memory_overlap.hpp" -#include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" - -namespace dpctl -{ -namespace tensor -{ -namespace py_internal -{ - -template -bool check_atomic_support(const sycl::queue &exec_q, - sycl::usm::alloc usm_alloc_type) -{ - bool supports_atomics = false; - - const sycl::device &dev = exec_q.get_device(); - - if constexpr (require_atomic64) { - if (!dev.has(sycl::aspect::atomic64)) - return false; - } - - switch (usm_alloc_type) { - case sycl::usm::alloc::shared: - supports_atomics = dev.has(sycl::aspect::usm_atomic_shared_allocations); - break; - case sycl::usm::alloc::host: - supports_atomics = dev.has(sycl::aspect::usm_atomic_host_allocations); - break; - case sycl::usm::alloc::device: - supports_atomics = true; - break; - default: - supports_atomics = false; - } - - return supports_atomics; -} - -template -bool fixed_decision(const sycl::queue &, sycl::usm::alloc) -{ - return return_value; -} - -/* ====================== dtype supported ======================== */ - -template -bool py_reduction_dtype_supported( - const py::dtype &input_dtype, - const py::dtype &output_dtype, - const std::string &dst_usm_type, - sycl::queue &q, - const fnT &atomic_dispatch_table, - const fnT &temps_dispatch_table, - const CheckAtomicSupportFnT &check_atomic_support_size4, - const CheckAtomicSupportFnT &check_atomic_support_size8) -{ - int arg_tn = - input_dtype.num(); // NumPy type numbers are the same as in dpctl - int out_tn = - output_dtype.num(); // NumPy type numbers are the same as in dpctl - int arg_typeid = -1; - int out_typeid = -1; - - auto array_types = td_ns::usm_ndarray_types(); - - try { - arg_typeid = array_types.typenum_to_lookup_id(arg_tn); - out_typeid = array_types.typenum_to_lookup_id(out_tn); - } catch (const std::exception &e) { - throw py::value_error(e.what()); - } - - if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 || - out_typeid >= td_ns::num_types) - { - throw std::runtime_error("Reduction type support check: lookup failed"); - } - - // remove_all_extents gets underlying type of table - using fn_ptrT = typename std::remove_all_extents::type; - fn_ptrT fn = nullptr; - - sycl::usm::alloc kind = sycl::usm::alloc::unknown; - - if (dst_usm_type == "device") { - kind = sycl::usm::alloc::device; - } - else if (dst_usm_type == "shared") { - kind = sycl::usm::alloc::shared; - } - else if (dst_usm_type == "host") { - kind = sycl::usm::alloc::host; - } - else { - throw py::value_error("Unrecognized `dst_usm_type` argument."); - } - - bool supports_atomics = false; - - switch (output_dtype.itemsize()) { - case sizeof(float): - { - supports_atomics = check_atomic_support_size4(q, kind); - } break; - case sizeof(double): - { - supports_atomics = check_atomic_support_size8(q, kind); - } break; - } - - if (supports_atomics) { - fn = atomic_dispatch_table[arg_typeid][out_typeid]; - } - - if (fn == 
nullptr) { - // use slower reduction implementation using temporaries - fn = temps_dispatch_table[arg_typeid][out_typeid]; - } - - return (fn != nullptr); -} - -/* ==================== Generic reductions ====================== */ - -template -std::pair py_reduction_over_axis( - const dpctl::tensor::usm_ndarray &src, - int trailing_dims_to_reduce, // comp over this many trailing indexes - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends, - const strided_fnT &atomic_dispatch_table, - const strided_fnT &temps_dispatch_table, - const contig_fnT &axis0_dispatch_table, - const contig_fnT &axis1_dispatch_table, - const SupportAtomicFnT &check_atomic_support_size4, - const SupportAtomicFnT &check_atomic_support_size8) -{ - int src_nd = src.get_ndim(); - int iteration_nd = src_nd - trailing_dims_to_reduce; - if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) { - throw py::value_error("Trailing_dim_to_reduce must be positive, but no " - "greater than rank of the array being reduced"); - } - - int dst_nd = dst.get_ndim(); - if (dst_nd != iteration_nd) { - throw py::value_error("Destination array rank does not match input " - "array rank and number of reduced dimensions"); - } - - const py::ssize_t *src_shape_ptr = src.get_shape_raw(); - const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); - - bool same_shapes = true; - for (int i = 0; same_shapes && (i < dst_nd); ++i) { - same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); - } - - if (!same_shapes) { - throw py::value_error("Destination shape does not match unreduced " - "dimensions of the input shape"); - } - - if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { - throw py::value_error( - "Execution queue is not compatible with allocation queues"); - } - - size_t dst_nelems = dst.get_size(); - - size_t reduction_nelems(1); - for (int i = dst_nd; i < src_nd; ++i) { - reduction_nelems *= static_cast(src_shape_ptr[i]); - } - - // check that dst and src do not overlap - auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); - if (overlap(src, dst)) { - throw py::value_error("Arrays index overlapping segments of memory"); - } - - // destination must be ample enough to accommodate all elements - { - auto dst_offsets = dst.get_minmax_offsets(); - size_t range = - static_cast(dst_offsets.second - dst_offsets.first); - if (range + 1 < dst_nelems) { - throw py::value_error( - "Destination array can not accommodate all the " - "elements of source array."); - } - } - - int src_typenum = src.get_typenum(); - int dst_typenum = dst.get_typenum(); - - namespace td_ns = dpctl::tensor::type_dispatch; - const auto &array_types = td_ns::usm_ndarray_types(); - int src_typeid = array_types.typenum_to_lookup_id(src_typenum); - int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); - - int dst_itemsize = dst.get_elemsize(); - bool supports_atomics = false; - - switch (dst_itemsize) { - case sizeof(float): - { - void *data_ptr = dst.get_data(); - const auto &ctx = exec_q.get_context(); - auto usm_type = sycl::get_pointer_type(data_ptr, ctx); - supports_atomics = check_atomic_support_size4(exec_q, usm_type); - } break; - case sizeof(double): - { - void *data_ptr = dst.get_data(); - const auto &ctx = exec_q.get_context(); - auto usm_type = sycl::get_pointer_type(data_ptr, ctx); - - supports_atomics = check_atomic_support_size8(exec_q, usm_type); - } break; - } - - // handle special case when both reduction and iteration are 1D contiguous - // and can be done with atomics - if 
(supports_atomics) { - bool is_src_c_contig = src.is_c_contiguous(); - bool is_dst_c_contig = dst.is_c_contiguous(); - bool is_src_f_contig = src.is_f_contiguous(); - - if ((is_src_c_contig && is_dst_c_contig) || - (is_src_f_contig && dst_nelems == 1)) - { - auto fn = axis1_dispatch_table[src_typeid][dst_typeid]; - - if (fn != nullptr) { - size_t iter_nelems = dst_nelems; - - constexpr py::ssize_t zero_offset = 0; - - sycl::event reduction_over_axis_contig_ev = - fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), - dst.get_data(), - zero_offset, // iteration_src_offset - zero_offset, // iteration_dst_offset - zero_offset, // reduction_src_offset - depends); - - sycl::event keep_args_event = dpctl::utils::keep_args_alive( - exec_q, {src, dst}, {reduction_over_axis_contig_ev}); - - return std::make_pair(keep_args_event, - reduction_over_axis_contig_ev); - } - } - else if (is_src_f_contig && - ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) - { - auto fn = axis0_dispatch_table[src_typeid][dst_typeid]; - if (fn != nullptr) { - size_t iter_nelems = dst_nelems; - - constexpr py::ssize_t zero_offset = 0; - - sycl::event reduction_over_axis_contig_ev = - fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), - dst.get_data(), - zero_offset, // iteration_src_offset - zero_offset, // iteration_dst_offset - zero_offset, // reduction_src_offset - depends); - - sycl::event keep_args_event = dpctl::utils::keep_args_alive( - exec_q, {src, dst}, {reduction_over_axis_contig_ev}); - - return std::make_pair(keep_args_event, - reduction_over_axis_contig_ev); - } - } - } - - using dpctl::tensor::py_internal::simplify_iteration_space; - using dpctl::tensor::py_internal::simplify_iteration_space_1; - - auto const &src_shape_vecs = src.get_shape_vector(); - auto const &src_strides_vecs = src.get_strides_vector(); - auto const &dst_strides_vecs = dst.get_strides_vector(); - - int reduction_nd = trailing_dims_to_reduce; - const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd; - using shT = std::vector; - shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd, - std::end(src_strides_vecs)); - - shT simplified_reduction_shape; - shT simplified_reduction_src_strides; - py::ssize_t reduction_src_offset(0); - - simplify_iteration_space_1( - reduction_nd, reduction_shape_ptr, reduction_src_strides, - // output - simplified_reduction_shape, simplified_reduction_src_strides, - reduction_src_offset); - - const py::ssize_t *iteration_shape_ptr = src_shape_ptr; - - shT iteration_src_strides(std::begin(src_strides_vecs), - std::begin(src_strides_vecs) + iteration_nd); - shT const &iteration_dst_strides = dst_strides_vecs; - - shT simplified_iteration_shape; - shT simplified_iteration_src_strides; - shT simplified_iteration_dst_strides; - py::ssize_t iteration_src_offset(0); - py::ssize_t iteration_dst_offset(0); - - if (iteration_nd == 0) { - if (dst_nelems != 1) { - throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1"); - } - iteration_nd = 1; - simplified_iteration_shape.push_back(1); - simplified_iteration_src_strides.push_back(0); - simplified_iteration_dst_strides.push_back(0); - } - else { - simplify_iteration_space(iteration_nd, iteration_shape_ptr, - iteration_src_strides, iteration_dst_strides, - // output - simplified_iteration_shape, - simplified_iteration_src_strides, - simplified_iteration_dst_strides, - iteration_src_offset, iteration_dst_offset); - } - - if (supports_atomics && (reduction_nd == 1) && (iteration_nd == 1)) { - bool mat_reduce_over_axis1 = 
false; - bool mat_reduce_over_axis0 = false; - bool array_reduce_all_elems = false; - size_t iter_nelems = dst_nelems; - - if (simplified_reduction_src_strides[0] == 1) { - array_reduce_all_elems = (simplified_iteration_shape[0] == 1); - mat_reduce_over_axis1 = - (simplified_iteration_dst_strides[0] == 1) && - (static_cast(simplified_iteration_src_strides[0]) == - reduction_nelems); - } - else if (static_cast(simplified_reduction_src_strides[0]) == - iter_nelems) - { - mat_reduce_over_axis0 = - (simplified_iteration_dst_strides[0] == 1) && - (simplified_iteration_src_strides[0] == 1); - } - - if (mat_reduce_over_axis1 || array_reduce_all_elems) { - auto fn = axis1_dispatch_table[src_typeid][dst_typeid]; - if (fn != nullptr) { - sycl::event reduction_over_axis1_contig_ev = - fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), - dst.get_data(), iteration_src_offset, - iteration_dst_offset, reduction_src_offset, depends); - - sycl::event keep_args_event = dpctl::utils::keep_args_alive( - exec_q, {src, dst}, {reduction_over_axis1_contig_ev}); - - return std::make_pair(keep_args_event, - reduction_over_axis1_contig_ev); - } - } - else if (mat_reduce_over_axis0) { - auto fn = axis0_dispatch_table[src_typeid][dst_typeid]; - if (fn != nullptr) { - sycl::event reduction_over_axis0_contig_ev = - fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), - dst.get_data(), iteration_src_offset, - iteration_dst_offset, reduction_src_offset, depends); - - sycl::event keep_args_event = dpctl::utils::keep_args_alive( - exec_q, {src, dst}, {reduction_over_axis0_contig_ev}); - - return std::make_pair(keep_args_event, - reduction_over_axis0_contig_ev); - } - } - } - - // remove_all_extents gets underlying type of table - using strided_fn_ptr_T = - typename std::remove_all_extents::type; - strided_fn_ptr_T fn = nullptr; - - if (supports_atomics) { - fn = atomic_dispatch_table[src_typeid][dst_typeid]; - } - - if (fn == nullptr) { - // use slower reduction implementation using temporaries - fn = temps_dispatch_table[src_typeid][dst_typeid]; - if (fn == nullptr) { - throw std::runtime_error("Datatypes are not supported"); - } - } - - std::vector host_task_events{}; - - using dpctl::tensor::offset_utils::device_allocate_and_pack; - - const auto &arrays_metainfo_packing_triple_ = - device_allocate_and_pack( - exec_q, host_task_events, - // iteration metadata - simplified_iteration_shape, simplified_iteration_src_strides, - simplified_iteration_dst_strides, - // reduction metadata - simplified_reduction_shape, simplified_reduction_src_strides); - py::ssize_t *temp_allocation_ptr = - std::get<0>(arrays_metainfo_packing_triple_); - if (temp_allocation_ptr == nullptr) { - throw std::runtime_error("Unable to allocate memory on device"); - } - const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); - - py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; - py::ssize_t *reduction_shape_stride = - temp_allocation_ptr + 3 * simplified_iteration_shape.size(); - - std::vector all_deps; - all_deps.reserve(depends.size() + 1); - all_deps.resize(depends.size()); - std::copy(depends.begin(), depends.end(), all_deps.begin()); - all_deps.push_back(copy_metadata_ev); - - auto reduction_ev = - fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), dst.get_data(), - iteration_nd, iter_shape_and_strides, iteration_src_offset, - iteration_dst_offset, - reduction_nd, // number dimensions being reduced - reduction_shape_stride, reduction_src_offset, all_deps); - - sycl::event temp_cleanup_ev = 
exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(reduction_ev); - const auto &ctx = exec_q.get_context(); - cgh.host_task([ctx, temp_allocation_ptr] { - sycl::free(temp_allocation_ptr, ctx); - }); - }); - host_task_events.push_back(temp_cleanup_ev); - - sycl::event keep_args_event = - dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); - - return std::make_pair(keep_args_event, reduction_ev); -} - -/* ==================== Search reductions ====================== */ - -template -std::pair py_search_over_axis( - const dpctl::tensor::usm_ndarray &src, - int trailing_dims_to_reduce, // comp over this many trailing indexes - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends, - const fn_tableT &dispatch_table) -{ - int src_nd = src.get_ndim(); - int iteration_nd = src_nd - trailing_dims_to_reduce; - if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) { - throw py::value_error("Trailing_dim_to_reduce must be positive, but no " - "greater than rank of the array being reduced"); - } - - int dst_nd = dst.get_ndim(); - if (dst_nd != iteration_nd) { - throw py::value_error("Destination array rank does not match input " - "array rank and number of reduced dimensions"); - } - - const py::ssize_t *src_shape_ptr = src.get_shape_raw(); - const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); - - bool same_shapes = true; - for (int i = 0; same_shapes && (i < dst_nd); ++i) { - same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); - } - - if (!same_shapes) { - throw py::value_error("Destination shape does not match unreduced " - "dimensions of the input shape"); - } - - if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { - throw py::value_error( - "Execution queue is not compatible with allocation queues"); - } - - size_t dst_nelems = dst.get_size(); - - size_t reduction_nelems(1); - for (int i = dst_nd; i < src_nd; ++i) { - reduction_nelems *= static_cast(src_shape_ptr[i]); - } - - // check that dst and src do not overlap - auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); - if (overlap(src, dst)) { - throw py::value_error("Arrays index overlapping segments of memory"); - } - - // destination must be ample enough to accommodate all elements - { - auto dst_offsets = dst.get_minmax_offsets(); - size_t range = - static_cast(dst_offsets.second - dst_offsets.first); - if (range + 1 < dst_nelems) { - throw py::value_error( - "Destination array can not accommodate all the " - "elements of source array."); - } - } - - int src_typenum = src.get_typenum(); - int dst_typenum = dst.get_typenum(); - - namespace td_ns = dpctl::tensor::type_dispatch; - const auto &array_types = td_ns::usm_ndarray_types(); - int src_typeid = array_types.typenum_to_lookup_id(src_typenum); - int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); - - using dpctl::tensor::py_internal::simplify_iteration_space; - using dpctl::tensor::py_internal::simplify_iteration_space_1; - - auto const &src_shape_vecs = src.get_shape_vector(); - auto const &src_strides_vecs = src.get_strides_vector(); - auto const &dst_strides_vecs = dst.get_strides_vector(); - - int reduction_nd = trailing_dims_to_reduce; - const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd; - using shT = std::vector; - shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd, - std::end(src_strides_vecs)); - - shT compact_reduction_shape; - shT compact_reduction_src_strides; - py::ssize_t reduction_src_offset(0); - - compact_iteration_space( - 
reduction_nd, reduction_shape_ptr, reduction_src_strides, - // output - compact_reduction_shape, compact_reduction_src_strides); - - const py::ssize_t *iteration_shape_ptr = src_shape_ptr; - - shT iteration_src_strides(std::begin(src_strides_vecs), - std::begin(src_strides_vecs) + iteration_nd); - shT const &iteration_dst_strides = dst_strides_vecs; - - shT simplified_iteration_shape; - shT simplified_iteration_src_strides; - shT simplified_iteration_dst_strides; - py::ssize_t iteration_src_offset(0); - py::ssize_t iteration_dst_offset(0); - - if (iteration_nd == 0) { - if (dst_nelems != 1) { - throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1"); - } - iteration_nd = 1; - simplified_iteration_shape.push_back(1); - simplified_iteration_src_strides.push_back(0); - simplified_iteration_dst_strides.push_back(0); - } - else { - simplify_iteration_space(iteration_nd, iteration_shape_ptr, - iteration_src_strides, iteration_dst_strides, - // output - simplified_iteration_shape, - simplified_iteration_src_strides, - simplified_iteration_dst_strides, - iteration_src_offset, iteration_dst_offset); - } - - auto fn = dispatch_table[src_typeid][dst_typeid]; - if (fn == nullptr) { - throw std::runtime_error("Datatypes are not supported"); - } - - std::vector host_task_events{}; - - using dpctl::tensor::offset_utils::device_allocate_and_pack; - - const auto &arrays_metainfo_packing_triple_ = - device_allocate_and_pack( - exec_q, host_task_events, - // iteration metadata - simplified_iteration_shape, simplified_iteration_src_strides, - simplified_iteration_dst_strides, - // reduction metadata - compact_reduction_shape, compact_reduction_src_strides); - py::ssize_t *temp_allocation_ptr = - std::get<0>(arrays_metainfo_packing_triple_); - if (temp_allocation_ptr == nullptr) { - throw std::runtime_error("Unable to allocate memory on device"); - } - const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); - - py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; - py::ssize_t *reduction_shape_stride = - temp_allocation_ptr + 3 * simplified_iteration_shape.size(); - - std::vector all_deps; - all_deps.reserve(depends.size() + 1); - all_deps.resize(depends.size()); - std::copy(depends.begin(), depends.end(), all_deps.begin()); - all_deps.push_back(copy_metadata_ev); - - auto comp_ev = fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), - dst.get_data(), iteration_nd, iter_shape_and_strides, - iteration_src_offset, iteration_dst_offset, - reduction_nd, // number dimensions being reduced - reduction_shape_stride, reduction_src_offset, all_deps); - - sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(comp_ev); - const auto &ctx = exec_q.get_context(); - cgh.host_task([ctx, temp_allocation_ptr] { - sycl::free(temp_allocation_ptr, ctx); - }); - }); - host_task_events.push_back(temp_cleanup_ev); - - sycl::event keep_args_event = - dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); - - return std::make_pair(keep_args_event, comp_ev); -} - -extern void init_reduction_functions(py::module_ m); - -} // namespace py_internal -} // namespace tensor -} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/argmax.cpp b/dpctl/tensor/libtensor/source/reductions/argmax.cpp new file mode 100644 index 0000000000..1d83bf9c2d --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/argmax.cpp @@ -0,0 +1,119 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control 
(dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "reduction_over_axis.hpp" +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::search_strided_impl_fn_ptr; +static search_strided_impl_fn_ptr + argmax_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::search_contig_impl_fn_ptr; +static search_contig_impl_fn_ptr + argmax_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +using dpctl::tensor::kernels::search_contig_impl_fn_ptr; +static search_contig_impl_fn_ptr + argmax_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_argmax_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::search_strided_impl_fn_ptr; + using td_ns::DispatchTableBuilder; + + using dpctl::tensor::kernels::ArgmaxOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(argmax_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::ArgmaxOverAxis1TempsContigFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(argmax_over_axis1_contig_temps_dispatch_table); + + using dpctl::tensor::kernels::ArgmaxOverAxis0TempsContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(argmax_over_axis0_contig_temps_dispatch_table); +} + +} // namespace impl + +void init_argmax(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_argmax_over_axis_dispatch_tables; + populate_argmax_over_axis_dispatch_tables(); + using impl::argmax_over_axis0_contig_temps_dispatch_table; + using impl::argmax_over_axis1_contig_temps_dispatch_table; + using impl::argmax_over_axis_strided_temps_dispatch_table; + + auto argmax_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_search_over_axis; + return py_search_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + argmax_over_axis_strided_temps_dispatch_table, + argmax_over_axis0_contig_temps_dispatch_table, + argmax_over_axis1_contig_temps_dispatch_table); + }; + m.def("_argmax_over_axis", argmax_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // 
namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/argmax.hpp b/dpctl/tensor/libtensor/source/reductions/argmax.hpp new file mode 100644 index 0000000000..9958396b43 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/argmax.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_argmax(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/argmin.cpp b/dpctl/tensor/libtensor/source/reductions/argmin.cpp new file mode 100644 index 0000000000..c6469e6864 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/argmin.cpp @@ -0,0 +1,119 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
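[Note on argmax.cpp above: each DispatchTableBuilder is instantiated on a kernel function-pointer type, a per-type-pair factory, and the number of supported types. The angle-bracket arguments are named by the using-declarations immediately preceding each builder, so the first instantiation presumably reads in full:

    DispatchTableBuilder<search_strided_impl_fn_ptr,
                         ArgmaxOverAxisTempsStridedFactory, td_ns::num_types>
        dtb1;
    dtb1.populate_dispatch_table(argmax_over_axis_strided_temps_dispatch_table);
]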
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "reduction_over_axis.hpp" +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::search_strided_impl_fn_ptr; +static search_strided_impl_fn_ptr + argmin_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::search_contig_impl_fn_ptr; +static search_contig_impl_fn_ptr + argmin_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +using dpctl::tensor::kernels::search_contig_impl_fn_ptr; +static search_contig_impl_fn_ptr + argmin_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_argmin_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::search_strided_impl_fn_ptr; + using td_ns::DispatchTableBuilder; + + using dpctl::tensor::kernels::ArgminOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(argmin_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::ArgminOverAxis1TempsContigFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(argmin_over_axis1_contig_temps_dispatch_table); + + using dpctl::tensor::kernels::ArgminOverAxis0TempsContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(argmin_over_axis0_contig_temps_dispatch_table); +} + +} // namespace impl + +void init_argmin(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_argmin_over_axis_dispatch_tables; + populate_argmin_over_axis_dispatch_tables(); + using impl::argmin_over_axis0_contig_temps_dispatch_table; + using impl::argmin_over_axis1_contig_temps_dispatch_table; + using impl::argmin_over_axis_strided_temps_dispatch_table; + + auto argmin_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_search_over_axis; + return py_search_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + argmin_over_axis_strided_temps_dispatch_table, + argmin_over_axis0_contig_temps_dispatch_table, + argmin_over_axis1_contig_temps_dispatch_table); + }; + m.def("_argmin_over_axis", argmin_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/argmin.hpp b/dpctl/tensor/libtensor/source/reductions/argmin.hpp new file mode 100644 index 0000000000..ea6ef1931c --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/argmin.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_argmin(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/logsumexp.cpp b/dpctl/tensor/libtensor/source/reductions/logsumexp.cpp new file mode 100644 index 0000000000..e3b015a4e0 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/logsumexp.cpp @@ -0,0 +1,136 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
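[Note: each new translation unit exposes a single init_* entry point through a small header (argmax.hpp above, argmin.hpp below, and so on). A hypothetical aggregator, sketched under the assumption that the registration function keeps the name init_reduction_functions from the deleted source; the real wiring may live in a file not shown in this hunk:

    #include <pybind11/pybind11.h>

    #include "argmax.hpp"
    #include "argmin.hpp"
    #include "logsumexp.hpp"
    #include "max.hpp"
    #include "min.hpp"
    #include "prod.hpp"

    namespace py = pybind11;

    namespace dpctl::tensor::py_internal
    {
    void init_reduction_functions(py::module_ m)
    {
        init_argmax(m);
        init_argmin(m);
        init_logsumexp(m);
        init_max(m);
        init_min(m);
        init_prod(m);
        // ... plus the remaining reductions (e.g., sum) not shown here
    }
    } // namespace dpctl::tensor::py_internal
]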
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "reduction_over_axis.hpp" +#include "utils/type_dispatch.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + logsumexp_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + logsumexp_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + logsumexp_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_logsumexp_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + using dpctl::tensor::kernels::LogSumExpOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table( + logsumexp_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::LogSumExpOverAxis1TempsContigFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table( + logsumexp_over_axis1_contig_temps_dispatch_table); + + using dpctl::tensor::kernels::LogSumExpOverAxis0TempsContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table( + logsumexp_over_axis0_contig_temps_dispatch_table); +} + +} // namespace impl + +void init_logsumexp(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_logsumexp_over_axis_dispatch_tables; + populate_logsumexp_over_axis_dispatch_tables(); + using impl::logsumexp_over_axis0_contig_temps_dispatch_table; + using impl::logsumexp_over_axis1_contig_temps_dispatch_table; + using impl::logsumexp_over_axis_strided_temps_dispatch_table; + + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + + auto logsumexp_pyapi = [&](const arrayT &src, + int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_tree_reduction_over_axis; + return py_tree_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + logsumexp_over_axis_strided_temps_dispatch_table, + logsumexp_over_axis0_contig_temps_dispatch_table, + logsumexp_over_axis1_contig_temps_dispatch_table); + }; + m.def("_logsumexp_over_axis", logsumexp_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto logsumexp_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + using dpctl::tensor::py_internal::py_tree_reduction_dtype_supported; + return py_tree_reduction_dtype_supported( + input_dtype, output_dtype, + logsumexp_over_axis_strided_temps_dispatch_table); + }; + m.def("_logsumexp_over_axis_dtype_supported", logsumexp_dtype_supported, + "", py::arg("arg_dtype"), 
py::arg("out_dtype")); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/logsumexp.hpp b/dpctl/tensor/libtensor/source/reductions/logsumexp.hpp new file mode 100644 index 0000000000..46b2156f46 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/logsumexp.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_logsumexp(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/max.cpp b/dpctl/tensor/libtensor/source/reductions/max.cpp new file mode 100644 index 0000000000..32c60b943b --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/max.cpp @@ -0,0 +1,171 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
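[Note on logsumexp.cpp above: logsumexp has no atomic path, so _logsumexp_over_axis_dtype_supported only needs to probe the temps table. A minimal sketch of what py_tree_reduction_dtype_supported presumably reduces to, assuming the typenum_to_lookup_id resolution from the deleted reduction_over_axis.hpp and eliding its error handling; the function name *_sketch is illustrative:

    template <typename fn_tableT>
    bool tree_reduction_dtype_supported_sketch(const py::dtype &arg_dtype,
                                               const py::dtype &out_dtype,
                                               const fn_tableT &temps_table)
    {
        auto array_types = td_ns::usm_ndarray_types();
        int arg_typeid = array_types.typenum_to_lookup_id(arg_dtype.num());
        int out_typeid = array_types.typenum_to_lookup_id(out_dtype.num());
        // supported exactly when a temps kernel was registered for the pair
        return (temps_table[arg_typeid][out_typeid] != nullptr);
    }
]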
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "utils/type_dispatch.hpp" + +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + max_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + max_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + max_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + max_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + max_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + max_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_max_over_axis_dispatch_tables(void) +{ + using td_ns::DispatchTableBuilder; + + using dpctl::tensor::kernels::MaxOverAxisAtomicStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(max_over_axis_strided_atomic_dispatch_table); + + using dpctl::tensor::kernels::MaxOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(max_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::MaxOverAxis1AtomicContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(max_over_axis1_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::MaxOverAxis0AtomicContigFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(max_over_axis0_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::MaxOverAxis1TempsContigFactory; + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(max_over_axis1_contig_temps_dispatch_table); + + using dpctl::tensor::kernels::MaxOverAxis0TempsContigFactory; + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(max_over_axis0_contig_temps_dispatch_table); +} + +using atomic_support::atomic_support_fn_ptr_t; +static atomic_support_fn_ptr_t max_atomic_support_vector[td_ns::num_types]; + +void populate_max_atomic_support_dispatch_vector(void) +{ + using td_ns::DispatchVectorBuilder; + + using atomic_support::MaxAtomicSupportFactory; + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(max_atomic_support_vector); +} + +} // namespace impl + +void init_max(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_max_over_axis_dispatch_tables; + populate_max_over_axis_dispatch_tables(); + using impl::max_over_axis0_contig_atomic_dispatch_table; + using impl::max_over_axis0_contig_temps_dispatch_table; + using impl::max_over_axis1_contig_atomic_dispatch_table; + using impl::max_over_axis1_contig_temps_dispatch_table; + using 
impl::max_over_axis_strided_atomic_dispatch_table; + using impl::max_over_axis_strided_temps_dispatch_table; + + using impl::populate_max_atomic_support_dispatch_vector; + populate_max_atomic_support_dispatch_vector(); + using impl::max_atomic_support_vector; + + auto max_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_reduction_over_axis; + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + max_over_axis_strided_atomic_dispatch_table, + max_over_axis0_contig_atomic_dispatch_table, + max_over_axis1_contig_atomic_dispatch_table, + max_over_axis_strided_temps_dispatch_table, + max_over_axis0_contig_temps_dispatch_table, + max_over_axis1_contig_temps_dispatch_table, + max_atomic_support_vector); + }; + m.def("_max_over_axis", max_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/max.hpp b/dpctl/tensor/libtensor/source/reductions/max.hpp new file mode 100644 index 0000000000..05a31fc1fb --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/max.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_max(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/min.cpp b/dpctl/tensor/libtensor/source/reductions/min.cpp new file mode 100644 index 0000000000..de1a81387d --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/min.cpp @@ -0,0 +1,173 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
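[Note on max.cpp above: the old per-call check_atomic_support_size4/size8 closures are replaced by a per-type dispatch vector populated from MaxAtomicSupportFactory. The factory's definition lives in reduction_atomic_support.hpp, which this patch references but does not show; a plausible shape, assuming helpers analogous to check_atomic_support and fixed_decision from the deleted header:

    namespace atomic_support
    {
    typedef bool (*atomic_support_fn_ptr_t)(const sycl::queue &,
                                            sycl::usm::alloc);

    // Hypothetical sketch, not the patch's actual definition
    template <typename dstTy> struct MaxAtomicSupportFactory
    {
        atomic_support_fn_ptr_t get() const
        {
            if constexpr (sizeof(dstTy) == 4) {
                return check_atomic_support</*require_atomic64*/ false>;
            }
            else if constexpr (sizeof(dstTy) == 8) {
                return check_atomic_support</*require_atomic64*/ true>;
            }
            else {
                return fixed_decision<false>; // no atomic path for this type
            }
        }
    };
    } // namespace atomic_support
]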
+// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "utils/type_dispatch.hpp" + +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + min_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + min_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + min_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + min_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + min_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + min_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_min_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using td_ns::DispatchTableBuilder; + + using dpctl::tensor::kernels::MinOverAxisAtomicStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(min_over_axis_strided_atomic_dispatch_table); + + using dpctl::tensor::kernels::MinOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(min_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::MinOverAxis1AtomicContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(min_over_axis1_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::MinOverAxis0AtomicContigFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(min_over_axis0_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::MinOverAxis1TempsContigFactory; + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(min_over_axis1_contig_temps_dispatch_table); + + using dpctl::tensor::kernels::MinOverAxis0TempsContigFactory; + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(min_over_axis0_contig_temps_dispatch_table); +} + +using atomic_support::atomic_support_fn_ptr_t; +static atomic_support_fn_ptr_t min_atomic_support_vector[td_ns::num_types]; + +void populate_min_atomic_support_dispatch_vector(void) +{ + using td_ns::DispatchVectorBuilder; + + using atomic_support::MinAtomicSupportFactory; + DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(min_atomic_support_vector); +} + +} // namespace impl + +void init_min(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_min_over_axis_dispatch_tables; + populate_min_over_axis_dispatch_tables(); + using 
impl::min_over_axis0_contig_atomic_dispatch_table; + using impl::min_over_axis0_contig_temps_dispatch_table; + using impl::min_over_axis1_contig_atomic_dispatch_table; + using impl::min_over_axis1_contig_temps_dispatch_table; + using impl::min_over_axis_strided_atomic_dispatch_table; + using impl::min_over_axis_strided_temps_dispatch_table; + + using impl::populate_min_atomic_support_dispatch_vector; + populate_min_atomic_support_dispatch_vector(); + using impl::min_atomic_support_vector; + + auto min_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_reduction_over_axis; + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + min_over_axis_strided_atomic_dispatch_table, + min_over_axis0_contig_atomic_dispatch_table, + min_over_axis1_contig_atomic_dispatch_table, + min_over_axis_strided_temps_dispatch_table, + min_over_axis0_contig_temps_dispatch_table, + min_over_axis1_contig_temps_dispatch_table, + min_atomic_support_vector); + }; + m.def("_min_over_axis", min_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/min.hpp b/dpctl/tensor/libtensor/source/reductions/min.hpp new file mode 100644 index 0000000000..cad94c7533 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/min.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_min(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/prod.cpp b/dpctl/tensor/libtensor/source/reductions/prod.cpp new file mode 100644 index 0000000000..a90d04304a --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/prod.cpp @@ -0,0 +1,187 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
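[Note on min.cpp above: compared with the deleted header, py_reduction_over_axis now takes six dispatch tables plus the atomic-support vector instead of two support callbacks. As a declaration sketch only, inferred from the call sites in max.cpp/min.cpp/prod.cpp; parameter names are illustrative and the authoritative declaration is in the new reduction_over_axis.hpp (not shown in this hunk):

    #include "dpctl4pybind11.hpp" // dpctl::tensor::usm_ndarray
    #include <sycl/sycl.hpp>
    #include <utility>
    #include <vector>

    template <typename strided_fnT, typename contig_fnT, typename support_vecT>
    std::pair<sycl::event, sycl::event> py_reduction_over_axis(
        const dpctl::tensor::usm_ndarray &src,
        int trailing_dims_to_reduce,
        const dpctl::tensor::usm_ndarray &dst,
        sycl::queue &exec_q,
        const std::vector<sycl::event> &depends,
        const strided_fnT &atomic_strided_table,
        const contig_fnT &axis0_atomic_contig_table,
        const contig_fnT &axis1_atomic_contig_table,
        const strided_fnT &temps_strided_table,
        const contig_fnT &axis0_temps_contig_table,
        const contig_fnT &axis1_temps_contig_table,
        const support_vecT &atomic_support_vector);
]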
diff --git a/dpctl/tensor/libtensor/source/reductions/prod.cpp b/dpctl/tensor/libtensor/source/reductions/prod.cpp
new file mode 100644
index 0000000000..a90d04304a
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/reductions/prod.cpp
@@ -0,0 +1,187 @@
+//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <complex>
+#include <pybind11/pybind11.h>
+#include <utility>
+#include <vector>
+
+#include "kernels/reductions.hpp"
+#include "utils/type_dispatch.hpp"
+
+#include "reduction_atomic_support.hpp"
+#include "reduction_over_axis.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace impl
+{
+
+using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+static reduction_strided_impl_fn_ptr
+    prod_over_axis_strided_atomic_dispatch_table[td_ns::num_types]
+                                                [td_ns::num_types];
+static reduction_strided_impl_fn_ptr
+    prod_over_axis_strided_temps_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+
+using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+static reduction_contig_impl_fn_ptr
+    prod_over_axis1_contig_atomic_dispatch_table[td_ns::num_types]
+                                                [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    prod_over_axis0_contig_atomic_dispatch_table[td_ns::num_types]
+                                                [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    prod_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    prod_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
+                                               [td_ns::num_types];
+
+void populate_prod_over_axis_dispatch_tables(void)
+{
+    using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+    using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+    using namespace td_ns;
+
+    using dpctl::tensor::kernels::ProductOverAxisAtomicStridedFactory;
+    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
+                         ProductOverAxisAtomicStridedFactory, td_ns::num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(prod_over_axis_strided_atomic_dispatch_table);
+
+    using dpctl::tensor::kernels::ProductOverAxisTempsStridedFactory;
+    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
+                         ProductOverAxisTempsStridedFactory, td_ns::num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(prod_over_axis_strided_temps_dispatch_table);
+
+    using dpctl::tensor::kernels::ProductOverAxis1AtomicContigFactory;
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         ProductOverAxis1AtomicContigFactory, td_ns::num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(prod_over_axis1_contig_atomic_dispatch_table);
+
+    using dpctl::tensor::kernels::ProductOverAxis0AtomicContigFactory;
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         ProductOverAxis0AtomicContigFactory, td_ns::num_types>
+        dtb4;
+    dtb4.populate_dispatch_table(prod_over_axis0_contig_atomic_dispatch_table);
+
+    using dpctl::tensor::kernels::ProductOverAxis1TempsContigFactory;
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         ProductOverAxis1TempsContigFactory, td_ns::num_types>
+        dtb5;
+    dtb5.populate_dispatch_table(prod_over_axis1_contig_temps_dispatch_table);
+
+    using dpctl::tensor::kernels::ProductOverAxis0TempsContigFactory;
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         ProductOverAxis0TempsContigFactory, td_ns::num_types>
+        dtb6;
+    dtb6.populate_dispatch_table(prod_over_axis0_contig_temps_dispatch_table);
+}
+
+using atomic_support::atomic_support_fn_ptr_t;
+static atomic_support_fn_ptr_t prod_atomic_support_vector[td_ns::num_types];
+
+void populate_prod_atomic_support_dispatch_vector(void)
+{
+    using td_ns::DispatchVectorBuilder;
+
+    using atomic_support::ProductAtomicSupportFactory;
+    DispatchVectorBuilder<atomic_support_fn_ptr_t, ProductAtomicSupportFactory,
+                          td_ns::num_types>
+        dvb;
+    dvb.populate_dispatch_vector(prod_atomic_support_vector);
+}
+
+} // namespace impl
+
+void init_prod(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        using impl::populate_prod_over_axis_dispatch_tables;
+        populate_prod_over_axis_dispatch_tables();
+        using impl::prod_over_axis0_contig_atomic_dispatch_table;
+        using impl::prod_over_axis0_contig_temps_dispatch_table;
+        using impl::prod_over_axis1_contig_atomic_dispatch_table;
+        using impl::prod_over_axis1_contig_temps_dispatch_table;
+        using impl::prod_over_axis_strided_atomic_dispatch_table;
+        using impl::prod_over_axis_strided_temps_dispatch_table;
+
+        using impl::populate_prod_atomic_support_dispatch_vector;
+        populate_prod_atomic_support_dispatch_vector();
+        using impl::prod_atomic_support_vector;
+
+        auto prod_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce,
+                              const arrayT &dst, sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            using dpctl::tensor::py_internal::py_reduction_over_axis;
+            return py_reduction_over_axis(
+                src, trailing_dims_to_reduce, dst, exec_q, depends,
+                prod_over_axis_strided_atomic_dispatch_table,
+                prod_over_axis0_contig_atomic_dispatch_table,
+                prod_over_axis1_contig_atomic_dispatch_table,
+                prod_over_axis_strided_temps_dispatch_table,
+                prod_over_axis0_contig_temps_dispatch_table,
+                prod_over_axis1_contig_temps_dispatch_table,
+                prod_atomic_support_vector);
+        };
+        m.def("_prod_over_axis", prod_pyapi, "", py::arg("src"),
+              py::arg("trailing_dims_to_reduce"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto prod_dtype_supported =
+            [&](const py::dtype &input_dtype, const py::dtype &output_dtype,
+                const std::string &dst_usm_type, sycl::queue &q) {
+                using dpctl::tensor::py_internal::py_reduction_dtype_supported;
+                return py_reduction_dtype_supported(
+                    input_dtype, output_dtype, dst_usm_type, q,
+                    prod_over_axis_strided_atomic_dispatch_table,
+                    prod_over_axis_strided_temps_dispatch_table,
+                    prod_atomic_support_vector);
+            };
+        m.def("_prod_over_axis_dtype_supported", prod_dtype_supported, "",
+              py::arg("arg_dtype"), py::arg("out_dtype"),
+              py::arg("dst_usm_type"), py::arg("sycl_queue"));
+    }
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
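
Every `_*_over_axis` entry point in these files is registered through the same pybind11 idiom: a lambda bound with named arguments and a `depends` keyword defaulted to an empty Python list. A stripped-down sketch of just that idiom, assuming pybind11 with its STL casters; the module and function names here are invented for illustration:

    // binding_sketch.cpp: named arguments with a defaulted keyword
    #include <pybind11/pybind11.h>
    #include <pybind11/stl.h>

    #include <vector>

    namespace py = pybind11;

    PYBIND11_MODULE(_binding_sketch, m)
    {
        // stand-in for a *_pyapi lambda: counts dependencies instead of reducing
        auto pyapi = [](int trailing_dims_to_reduce,
                        const std::vector<int> &depends) {
            return trailing_dims_to_reduce + static_cast<int>(depends.size());
        };
        m.def("_reduce_stub", pyapi, "", py::arg("trailing_dims_to_reduce"),
              py::arg("depends") = py::list());
    }

Defaulting to `py::list()` rather than a C++ default keeps the default value visible in the function's Python signature.
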
diff --git a/dpctl/tensor/libtensor/source/reductions/prod.hpp b/dpctl/tensor/libtensor/source/reductions/prod.hpp
new file mode 100644
index 0000000000..026e7d8923
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/reductions/prod.hpp
@@ -0,0 +1,41 @@
+//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_prod(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/reduce_hypot.cpp b/dpctl/tensor/libtensor/source/reductions/reduce_hypot.cpp
new file mode 100644
index 0000000000..c7313930b4
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/reductions/reduce_hypot.cpp
@@ -0,0 +1,132 @@
+//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <complex>
+#include <pybind11/pybind11.h>
+#include <utility>
+#include <vector>
+
+#include "kernels/reductions.hpp"
+#include "reduction_over_axis.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+namespace impl
+{
+
+using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+static reduction_strided_impl_fn_ptr
+    hypot_over_axis_strided_temps_dispatch_table[td_ns::num_types]
+                                                [td_ns::num_types];
+
+using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+static reduction_contig_impl_fn_ptr
+    hypot_over_axis1_contig_temps_dispatch_table[td_ns::num_types]
+                                                [td_ns::num_types];
+static reduction_contig_impl_fn_ptr
+    hypot_over_axis0_contig_temps_dispatch_table[td_ns::num_types]
+                                                [td_ns::num_types];
+
+void populate_hypot_over_axis_dispatch_tables(void)
+{
+    using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr;
+    using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr;
+    using namespace td_ns;
+
+    using dpctl::tensor::kernels::HypotOverAxisTempsStridedFactory;
+    DispatchTableBuilder<reduction_strided_impl_fn_ptr,
+                         HypotOverAxisTempsStridedFactory, td_ns::num_types>
+        dtb1;
+    dtb1.populate_dispatch_table(hypot_over_axis_strided_temps_dispatch_table);
+
+    using dpctl::tensor::kernels::HypotOverAxis1TempsContigFactory;
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         HypotOverAxis1TempsContigFactory, td_ns::num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(hypot_over_axis1_contig_temps_dispatch_table);
+
+    using dpctl::tensor::kernels::HypotOverAxis0TempsContigFactory;
+    DispatchTableBuilder<reduction_contig_impl_fn_ptr,
+                         HypotOverAxis0TempsContigFactory, td_ns::num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(hypot_over_axis0_contig_temps_dispatch_table);
+}
+
+} // namespace impl
+
+void init_reduce_hypot(py::module_ m)
+{
+    using arrayT = dpctl::tensor::usm_ndarray;
+    using event_vecT = std::vector<sycl::event>;
+    {
+        using
impl::populate_hypot_over_axis_dispatch_tables; + populate_hypot_over_axis_dispatch_tables(); + using impl::hypot_over_axis0_contig_temps_dispatch_table; + using impl::hypot_over_axis1_contig_temps_dispatch_table; + using impl::hypot_over_axis_strided_temps_dispatch_table; + + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + + auto hypot_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_tree_reduction_over_axis; + return py_tree_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + hypot_over_axis_strided_temps_dispatch_table, + hypot_over_axis0_contig_temps_dispatch_table, + hypot_over_axis1_contig_temps_dispatch_table); + }; + m.def("_hypot_over_axis", hypot_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto hypot_dtype_supported = [&](const py::dtype &input_dtype, + const py::dtype &output_dtype) { + using dpctl::tensor::py_internal::py_tree_reduction_dtype_supported; + return py_tree_reduction_dtype_supported( + input_dtype, output_dtype, + hypot_over_axis_strided_temps_dispatch_table); + }; + m.def("_hypot_over_axis_dtype_supported", hypot_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype")); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/reduce_hypot.hpp b/dpctl/tensor/libtensor/source/reductions/reduce_hypot.hpp new file mode 100644 index 0000000000..92b7fac363 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/reduce_hypot.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_reduce_hypot(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp
new file mode 100644
index 0000000000..695f4b73d0
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp
@@ -0,0 +1,143 @@
+//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+#include <CL/sycl.hpp>
+#include <type_traits>
+#include <utility>
+
+#include "utils/type_utils.hpp"
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+namespace atomic_support
+{
+
+typedef bool (*atomic_support_fn_ptr_t)(const sycl::queue &, sycl::usm::alloc);
+
+/*! @brief Function which returns a constant value for atomic support */
+template <bool return_value>
+bool fixed_decision(const sycl::queue &, sycl::usm::alloc)
+{
+    return return_value;
+}
+
+/*! @brief Template for querying atomic support for a type on a device */
+template <typename T>
+bool check_atomic_support(const sycl::queue &exec_q,
+                          sycl::usm::alloc usm_alloc_type)
+{
+    constexpr bool atomic32 = (sizeof(T) == 4);
+    constexpr bool atomic64 = (sizeof(T) == 8);
+    using dpctl::tensor::type_utils::is_complex;
+    if constexpr ((!atomic32 && !atomic64) || is_complex<T>::value) {
+        return fixed_decision<false>(exec_q, usm_alloc_type);
+    }
+    else {
+        bool supports_atomics = false;
+        const sycl::device &dev = exec_q.get_device();
+        if constexpr (atomic64) {
+            if (!dev.has(sycl::aspect::atomic64)) {
+                return false;
+            }
+        }
+        switch (usm_alloc_type) {
+        case sycl::usm::alloc::shared:
+            supports_atomics =
+                dev.has(sycl::aspect::usm_atomic_shared_allocations);
+            break;
+        case sycl::usm::alloc::host:
+            supports_atomics =
+                dev.has(sycl::aspect::usm_atomic_host_allocations);
+            break;
+        case sycl::usm::alloc::device:
+            supports_atomics = true;
+            break;
+        default:
+            supports_atomics = false;
+        }
+        return supports_atomics;
+    }
+}
+
+template <typename fnT, typename T> struct ArithmeticAtomicSupportFactory
+{
+    fnT get()
+    {
+        using dpctl::tensor::type_utils::is_complex;
+        if constexpr (std::is_floating_point_v<T> ||
+                      std::is_same_v<T, sycl::half> || is_complex<T>::value)
+        {
+            // for real- and complex-valued floating point types, tree
+            // reduction has better round-off accumulation properties
+            // (round-off error is proportional to log2(reduction_size),
+            // while the naive elementwise summation used by the atomic
+            // implementation has round-off error growing proportionally to
+            // reduction_size); hence reductions over floating point types
+            // should always use the tree-reduction algorithm, even though
+            // the atomic implementation may be applicable
+            return fixed_decision<false>;
+        }
+        else {
+            return check_atomic_support<T>;
+        }
+    }
+};
+
+template <typename fnT, typename T> struct MinMaxAtomicSupportFactory
+{
+    fnT get()
+    {
+        return check_atomic_support<T>;
+    }
+};
+
+template <typename fnT, typename T>
+struct MaxAtomicSupportFactory : public ArithmeticAtomicSupportFactory<fnT, T>
+{
+};
+
+template <typename fnT, typename T>
+struct MinAtomicSupportFactory : public ArithmeticAtomicSupportFactory<fnT, T>
+{
+};
+
+template <typename fnT, typename T>
+struct SumAtomicSupportFactory : public ArithmeticAtomicSupportFactory<fnT, T>
+{
+};
+
+template <typename fnT, typename T>
+struct ProductAtomicSupportFactory
+    : public ArithmeticAtomicSupportFactory<fnT, T>
+{
+};
+
+} // namespace atomic_support
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
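
ArithmeticAtomicSupportFactory::get() above selects a function pointer at compile time: floating-point result types are pinned to fixed_decision<false> so tree reduction is always chosen, while the remaining types defer to the runtime device query. The same selection logic can be exercised without SYCL; in this sketch the device query is replaced by a trivial stub, and only the `if constexpr` dispatch mirrors the header above:

    // atomic_support_sketch.cpp: compile-time selection of a support predicate
    #include <iostream>
    #include <type_traits>

    using support_fn = bool (*)();

    template <bool v> bool fixed_decision() { return v; }

    // stub for the device-aspect query; the real code inspects a sycl::device
    template <typename T> bool runtime_check()
    {
        return sizeof(T) == 4 || sizeof(T) == 8;
    }

    template <typename T> support_fn select()
    {
        if constexpr (std::is_floating_point_v<T>) {
            return fixed_decision<false>; // always use tree reduction
        }
        else {
            return runtime_check<T>;
        }
    }

    int main()
    {
        std::cout << select<float>()() << ' ' << select<int>()() << '\n';
        // prints: 0 1
    }
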
diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_common.cpp b/dpctl/tensor/libtensor/source/reductions/reduction_common.cpp
new file mode 100644
index 0000000000..99edf663ad
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/reductions/reduction_common.cpp
@@ -0,0 +1,60 @@
+//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#include <pybind11/pybind11.h>
+
+#include "argmax.hpp"
+#include "argmin.hpp"
+#include "logsumexp.hpp"
+#include "max.hpp"
+#include "min.hpp"
+#include "prod.hpp"
+#include "reduce_hypot.hpp"
+#include "sum.hpp"
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+/*! @brief Add reduction functions to Python module */
+void init_reduction_functions(py::module_ m)
+{
+    init_argmax(m);
+    init_argmin(m);
+    init_logsumexp(m);
+    init_max(m);
+    init_min(m);
+    init_prod(m);
+    init_reduce_hypot(m);
+    init_sum(m);
+}
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_common.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_common.hpp
new file mode 100644
index 0000000000..61c992364a
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/reductions/reduction_common.hpp
@@ -0,0 +1,41 @@
+//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===--------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions
+//===--------------------------------------------------------------------===//
+
+#pragma once
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+extern void init_reduction_functions(py::module_);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
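
init_reduction_functions is the single hook the parent _tensor_impl module needs to call; each reduction contributes its own init_* function. A minimal sketch of that registration fan-out (module, function, and binding names are invented for illustration):

    // fanout_sketch.cpp: aggregating per-feature init hooks into one module
    #include <pybind11/pybind11.h>

    namespace py = pybind11;

    // stand-ins for init_min, init_prod, ...: each adds its own bindings
    void init_alpha(py::module_ m) { m.def("alpha", []() { return 1; }); }
    void init_beta(py::module_ m) { m.def("beta", []() { return 2; }); }

    void init_all(py::module_ m)
    {
        init_alpha(m);
        init_beta(m);
    }

    PYBIND11_MODULE(_fanout_sketch, m) { init_all(m); }
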
diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp
new file mode 100644
index 0000000000..da8da0938d
--- /dev/null
+++ b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp
@@ -0,0 +1,1095 @@
+//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===//
+//
+// Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines functions of dpctl.tensor._tensor_impl extensions,
+/// specifically functions for reductions.
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <algorithm>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "dpctl4pybind11.hpp"
+#include <CL/sycl.hpp>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+
+#include "kernels/reductions.hpp"
+#include "simplify_iteration_space.hpp"
+#include "utils/memory_overlap.hpp"
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch.hpp"
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace py_internal
+{
+
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+/* ====================== dtype supported ======================== */
+
+/*! @brief Template implementing Python API for querying type support by
+ * reduction which may support atomics */
+template <typename fnT, typename CheckAtomicSupportFnT>
+bool py_reduction_dtype_supported(
+    const py::dtype &input_dtype,
+    const py::dtype &output_dtype,
+    const std::string &dst_usm_type,
+    sycl::queue &q,
+    const fnT &atomic_dispatch_table,
+    const fnT &temps_dispatch_table,
+    const CheckAtomicSupportFnT &check_atomic_support)
+{
+    int arg_tn =
+        input_dtype.num(); // NumPy type numbers are the same as in dpctl
+    int out_tn =
+        output_dtype.num(); // NumPy type numbers are the same as in dpctl
+    int arg_typeid = -1;
+    int out_typeid = -1;
+
+    auto array_types = td_ns::usm_ndarray_types();
+
+    try {
+        arg_typeid = array_types.typenum_to_lookup_id(arg_tn);
+        out_typeid = array_types.typenum_to_lookup_id(out_tn);
+    } catch (const std::exception &e) {
+        throw py::value_error(e.what());
+    }
+
+    if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 ||
+        out_typeid >= td_ns::num_types)
+    {
+        throw std::runtime_error("Reduction type support check: lookup failed");
+    }
+
+    // remove_all_extents gets underlying type of table
+    using fn_ptrT = typename std::remove_all_extents<fnT>::type;
+    fn_ptrT fn = nullptr;
+
+    sycl::usm::alloc kind = sycl::usm::alloc::unknown;
+
+    if (dst_usm_type == "device") {
+        kind = sycl::usm::alloc::device;
+    }
+    else if (dst_usm_type == "shared") {
+        kind = sycl::usm::alloc::shared;
+    }
+    else if (dst_usm_type == "host") {
+        kind = sycl::usm::alloc::host;
+    }
+    else {
+        throw py::value_error("Unrecognized `dst_usm_type` argument.");
+    }
+
+    bool supports_atomics = check_atomic_support[out_typeid](q, kind);
+
+    if (supports_atomics) {
+        fn = atomic_dispatch_table[arg_typeid][out_typeid];
+    }
+
+    if (fn == nullptr) {
+        // use slower reduction implementation using temporaries
+        fn = temps_dispatch_table[arg_typeid][out_typeid];
+    }
+
+    return (fn != nullptr);
+}
+
+/*! @brief Template implementing Python API for querying type support by tree
+ * reduction */
+template <typename fnT>
+bool py_tree_reduction_dtype_supported(const py::dtype &input_dtype,
+                                       const py::dtype &output_dtype,
+                                       const fnT &temps_dispatch_table)
+{
+    int arg_tn =
+        input_dtype.num(); // NumPy type numbers are the same as in dpctl
+    int out_tn =
+        output_dtype.num(); // NumPy type numbers are the same as in dpctl
+    int arg_typeid = -1;
+    int out_typeid = -1;
+
+    auto array_types = td_ns::usm_ndarray_types();
+
+    try {
+        arg_typeid = array_types.typenum_to_lookup_id(arg_tn);
+        out_typeid = array_types.typenum_to_lookup_id(out_tn);
+    } catch (const std::exception &e) {
+        throw py::value_error(e.what());
+    }
+
+    if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 ||
+        out_typeid >= td_ns::num_types)
+    {
+        throw std::runtime_error("Reduction type support check: lookup failed");
+    }
+
+    auto fn = temps_dispatch_table[arg_typeid][out_typeid];
+
+    return (fn != nullptr);
+}
+
+/* ==================== Generic reductions ====================== */
+
+/*! @brief Template implementing Python API for reduction over axis which may
+ * support atomics */
+template <typename strided_fnT, typename contig_fnT, typename SupportAtomicFnT>
+std::pair<sycl::event, sycl::event> py_reduction_over_axis(
+    const dpctl::tensor::usm_ndarray &src,
+    int trailing_dims_to_reduce, // comp over this many trailing indexes
+    const dpctl::tensor::usm_ndarray &dst,
+    sycl::queue &exec_q,
+    const std::vector<sycl::event> &depends,
+    const strided_fnT &atomic_dispatch_table,
+    const contig_fnT &axis0_atomic_dispatch_table,
+    const contig_fnT &axis1_atomic_dispatch_table,
+    const strided_fnT &temps_dispatch_table,
+    const contig_fnT &axis0_temps_dispatch_table,
+    const contig_fnT &axis1_temps_dispatch_table,
+    const SupportAtomicFnT &check_atomic_support)
+{
+    int src_nd = src.get_ndim();
+    int iteration_nd = src_nd - trailing_dims_to_reduce;
+    if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) {
+        throw py::value_error("Trailing_dims_to_reduce must be positive, but "
+                              "no greater than rank of the array being "
+                              "reduced");
+    }
+
+    int dst_nd = dst.get_ndim();
+    if (dst_nd != iteration_nd) {
+        throw py::value_error("Destination array rank does not match input "
+                              "array rank and number of reduced dimensions");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    bool same_shapes = true;
+    for (int i = 0; same_shapes && (i < dst_nd); ++i) {
+        same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]);
+    }
+
+    if (!same_shapes) {
+        throw py::value_error("Destination shape does not match unreduced "
+                              "dimensions of the input shape");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    size_t dst_nelems = dst.get_size();
+
+    size_t reduction_nelems(1);
+    for (int i = dst_nd; i < src_nd; ++i) {
+        reduction_nelems *= static_cast<size_t>(src_shape_ptr[i]);
+    }
+
+    // check that dst and src do not overlap
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    // destination must be ample enough to accommodate all elements
+    {
+        auto dst_offsets = dst.get_minmax_offsets();
+        size_t range =
+            static_cast<size_t>(dst_offsets.second - dst_offsets.first);
+        if (range + 1 < dst_nelems) {
+            throw py::value_error(
+                "Destination array can not accommodate all the "
+                "elements of source array.");
+        }
+    }
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    namespace td_ns = dpctl::tensor::type_dispatch;
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    void *data_ptr = dst.get_data();
+    const auto &ctx = exec_q.get_context();
+    auto usm_type = sycl::get_pointer_type(data_ptr, ctx);
+
+    bool supports_atomics = check_atomic_support[dst_typeid](exec_q, usm_type);
+
+    // handle special case when both reduction and iteration are 1D contiguous
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+    bool is_src_f_contig = src.is_f_contiguous();
+
+    if ((is_src_c_contig && is_dst_c_contig) ||
+        (is_src_f_contig && dst_nelems == 1))
+    {
+        // remove_all_extents gets underlying type of table
+        using contig_fn_ptr_T =
+            typename std::remove_all_extents<contig_fnT>::type;
+        contig_fn_ptr_T fn;
+        if (supports_atomics) {
+            fn = axis1_atomic_dispatch_table[src_typeid][dst_typeid];
+        }
+        else {
+            fn = axis1_temps_dispatch_table[src_typeid][dst_typeid];
+        }
+        if (fn != nullptr) {
+            size_t iter_nelems = dst_nelems;
+
+            constexpr py::ssize_t zero_offset = 0;
+
+            sycl::event reduction_over_axis_contig_ev =
+                fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                   dst.get_data(),
+                   zero_offset, // iteration_src_offset
+                   zero_offset, // iteration_dst_offset
+                   zero_offset, // reduction_src_offset
+                   depends);
+
+            sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                exec_q, {src, dst}, {reduction_over_axis_contig_ev});
+
+            return std::make_pair(keep_args_event,
+                                  reduction_over_axis_contig_ev);
+        }
+    }
+    else if (is_src_f_contig &&
+             ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous()))
+    {
+        // remove_all_extents gets underlying type of table
+        using contig_fn_ptr_T =
+            typename std::remove_all_extents<contig_fnT>::type;
+        contig_fn_ptr_T fn;
+        if (supports_atomics) {
+            fn = axis0_atomic_dispatch_table[src_typeid][dst_typeid];
+        }
+        else {
+            fn = axis0_temps_dispatch_table[src_typeid][dst_typeid];
+        }
+        if (fn != nullptr) {
+            size_t iter_nelems = dst_nelems;
+
+            constexpr py::ssize_t zero_offset = 0;
+
+            sycl::event reduction_over_axis_contig_ev =
+                fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                   dst.get_data(),
+                   zero_offset, // iteration_src_offset
+                   zero_offset, // iteration_dst_offset
+                   zero_offset, // reduction_src_offset
+                   depends);
+
+            sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                exec_q, {src, dst}, {reduction_over_axis_contig_ev});
+
+            return std::make_pair(keep_args_event,
+                                  reduction_over_axis_contig_ev);
+        }
+    }
+
+    using dpctl::tensor::py_internal::simplify_iteration_space;
+    using dpctl::tensor::py_internal::simplify_iteration_space_1;
+
+    auto const &src_shape_vecs = src.get_shape_vector();
+    auto const &src_strides_vecs = src.get_strides_vector();
+    auto const &dst_strides_vecs = dst.get_strides_vector();
+
+    int reduction_nd = trailing_dims_to_reduce;
+    const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd;
+    using shT = std::vector<py::ssize_t>;
+    shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd,
+                              std::end(src_strides_vecs));
+
+    shT simplified_reduction_shape;
+    shT simplified_reduction_src_strides;
+    py::ssize_t reduction_src_offset(0);
+
+    simplify_iteration_space_1(
+        reduction_nd, reduction_shape_ptr, reduction_src_strides,
+        // output
+        simplified_reduction_shape, simplified_reduction_src_strides,
+        reduction_src_offset);
+
+    const py::ssize_t *iteration_shape_ptr = src_shape_ptr;
+
+    shT iteration_src_strides(std::begin(src_strides_vecs),
+                              std::begin(src_strides_vecs) + iteration_nd);
+    shT const &iteration_dst_strides = dst_strides_vecs;
+
+    shT simplified_iteration_shape;
+    shT simplified_iteration_src_strides;
+    shT simplified_iteration_dst_strides;
+    py::ssize_t iteration_src_offset(0);
+    py::ssize_t iteration_dst_offset(0);
+
+    if (iteration_nd == 0) {
+        if (dst_nelems != 1) {
+            throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1");
+        }
+        iteration_nd = 1;
+        simplified_iteration_shape.push_back(1);
+        simplified_iteration_src_strides.push_back(0);
+        simplified_iteration_dst_strides.push_back(0);
+    }
+    else {
+        simplify_iteration_space(iteration_nd, iteration_shape_ptr,
+                                 iteration_src_strides, iteration_dst_strides,
+                                 // output
+                                 simplified_iteration_shape,
+                                 simplified_iteration_src_strides,
+                                 simplified_iteration_dst_strides,
+                                 iteration_src_offset, iteration_dst_offset);
+    }
+
+    if ((reduction_nd == 1) && (iteration_nd == 1)) {
+        bool mat_reduce_over_axis1 = false;
+        bool mat_reduce_over_axis0 = false;
+        bool array_reduce_all_elems = false;
+        size_t iter_nelems = dst_nelems;
+
+        if (simplified_reduction_src_strides[0] == 1) {
+            array_reduce_all_elems = (simplified_iteration_shape[0] == 1);
+            mat_reduce_over_axis1 =
+                (simplified_iteration_dst_strides[0] == 1) &&
+                (static_cast<size_t>(simplified_iteration_src_strides[0]) ==
+                 reduction_nelems);
+        }
+        else if (static_cast<size_t>(simplified_reduction_src_strides[0]) ==
+                 iter_nelems)
+        {
+            mat_reduce_over_axis0 =
+                (simplified_iteration_dst_strides[0] == 1) &&
+                (simplified_iteration_src_strides[0] == 1);
+        }
+
+        if (mat_reduce_over_axis1 || array_reduce_all_elems) {
+            using contig_fn_ptr_T =
+                typename std::remove_all_extents<contig_fnT>::type;
+            contig_fn_ptr_T fn;
+            if (supports_atomics) {
+                fn = axis1_atomic_dispatch_table[src_typeid][dst_typeid];
+            }
+            else {
+                fn = axis1_temps_dispatch_table[src_typeid][dst_typeid];
+            }
+            if (fn != nullptr) {
+                sycl::event reduction_over_axis1_contig_ev =
+                    fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                       dst.get_data(), iteration_src_offset,
+                       iteration_dst_offset, reduction_src_offset, depends);
+
+                sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                    exec_q, {src, dst}, {reduction_over_axis1_contig_ev});
+
+                return std::make_pair(keep_args_event,
+                                      reduction_over_axis1_contig_ev);
+            }
+        }
+        else if (mat_reduce_over_axis0) {
+            using contig_fn_ptr_T =
+                typename std::remove_all_extents<contig_fnT>::type;
+            contig_fn_ptr_T fn;
+            if (supports_atomics) {
+                fn = axis0_atomic_dispatch_table[src_typeid][dst_typeid];
+            }
+            else {
+                fn = axis0_temps_dispatch_table[src_typeid][dst_typeid];
+            }
+            if (fn != nullptr) {
+                sycl::event reduction_over_axis0_contig_ev =
+                    fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                       dst.get_data(), iteration_src_offset,
+                       iteration_dst_offset, reduction_src_offset, depends);
+
+                sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                    exec_q, {src, dst}, {reduction_over_axis0_contig_ev});
+
+                return std::make_pair(keep_args_event,
+                                      reduction_over_axis0_contig_ev);
+            }
+        }
+    }
+
+    // remove_all_extents gets underlying type of table
+    using strided_fn_ptr_T =
+        typename std::remove_all_extents<strided_fnT>::type;
+    strided_fn_ptr_T fn = nullptr;
+
+    if (supports_atomics) {
+        fn = atomic_dispatch_table[src_typeid][dst_typeid];
+    }
+
+    if (fn == nullptr) {
+        // use slower reduction implementation using temporaries
+        fn = temps_dispatch_table[src_typeid][dst_typeid];
+        if (fn == nullptr) {
+            throw std::runtime_error("Datatypes are not supported");
+        }
+    }
+
+    std::vector<sycl::event> host_task_events{};
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    const auto &arrays_metainfo_packing_triple_ =
+        device_allocate_and_pack<py::ssize_t>(
+            exec_q, host_task_events,
+            // iteration metadata
+            simplified_iteration_shape, simplified_iteration_src_strides,
+            simplified_iteration_dst_strides,
+            // reduction metadata
+            simplified_reduction_shape, simplified_reduction_src_strides);
+    py::ssize_t *temp_allocation_ptr =
+        std::get<0>(arrays_metainfo_packing_triple_);
+    if (temp_allocation_ptr == nullptr) {
+        throw std::runtime_error("Unable to allocate memory on device");
+    }
+    const auto &copy_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_);
+
+    py::ssize_t *iter_shape_and_strides = temp_allocation_ptr;
+    py::ssize_t *reduction_shape_stride =
+        temp_allocation_ptr + 3 * simplified_iteration_shape.size();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.resize(depends.size());
+    std::copy(depends.begin(), depends.end(), all_deps.begin());
+    all_deps.push_back(copy_metadata_ev);
+
+    auto reduction_ev =
+        fn(exec_q, dst_nelems, reduction_nelems, src.get_data(),
+           dst.get_data(), iteration_nd, iter_shape_and_strides,
+           iteration_src_offset, iteration_dst_offset,
+           reduction_nd, // number dimensions being reduced
+           reduction_shape_stride, reduction_src_offset, all_deps);
+
+    sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(reduction_ev);
+        const auto &ctx = exec_q.get_context();
+        cgh.host_task([ctx, temp_allocation_ptr] {
+            sycl::free(temp_allocation_ptr, ctx);
+        });
+    });
+    host_task_events.push_back(temp_cleanup_ev);
+
+    sycl::event keep_args_event =
+        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events);
+
+    return std::make_pair(keep_args_event, reduction_ev);
+}
+
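
Before the strided kernel above is dispatched, the iteration and reduction indexing are run through simplify_iteration_space_1 / simplify_iteration_space, which, among other things, fuse adjacent axes with compatible strides. The sketch below shows only that fusion idea under the usual rule stride[i] == shape[i+1] * stride[i+1]; it illustrates the idea and is not the dpctl implementation:

    // fusion_sketch.cpp: collapsing a C-contiguous 3-d index space to 1-d
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main()
    {
        std::vector<long> shape{2, 3, 4}, strides{12, 4, 1}; // C-contiguous
        std::vector<long> s_shape{shape[0]}, s_strides{strides[0]};
        for (std::size_t i = 1; i < shape.size(); ++i) {
            if (s_strides.back() == shape[i] * strides[i]) { // fusable
                s_shape.back() *= shape[i];
                s_strides.back() = strides[i];
            }
            else {
                s_shape.push_back(shape[i]);
                s_strides.push_back(strides[i]);
            }
        }
        std::printf("%zu axis, extent %ld, stride %ld\n", s_shape.size(),
                    s_shape[0], s_strides[0]); // 1 axis, extent 24, stride 1
    }
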
+/* ================= No atomic reductions ====================== */
+
+/*! @brief Template implementing Python API for reduction over axis without
+ * atomics */
+template <typename strided_fnT, typename contig_fnT>
+std::pair<sycl::event, sycl::event> py_tree_reduction_over_axis(
+    const dpctl::tensor::usm_ndarray &src,
+    int trailing_dims_to_reduce, // comp over this many trailing indexes
+    const dpctl::tensor::usm_ndarray &dst,
+    sycl::queue &exec_q,
+    const std::vector<sycl::event> &depends,
+    const strided_fnT &temps_dispatch_table,
+    const contig_fnT &axis0_temps_dispatch_table,
+    const contig_fnT &axis1_temps_dispatch_table)
+{
+    int src_nd = src.get_ndim();
+    int iteration_nd = src_nd - trailing_dims_to_reduce;
+    if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) {
+        throw py::value_error("Trailing_dims_to_reduce must be positive, but "
+                              "no greater than rank of the array being "
+                              "reduced");
+    }
+
+    int dst_nd = dst.get_ndim();
+    if (dst_nd != iteration_nd) {
+        throw py::value_error("Destination array rank does not match input "
+                              "array rank and number of reduced dimensions");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    bool same_shapes = true;
+    for (int i = 0; same_shapes && (i < dst_nd); ++i) {
+        same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]);
+    }
+
+    if (!same_shapes) {
+        throw py::value_error("Destination shape does not match unreduced "
+                              "dimensions of the input shape");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    size_t dst_nelems = dst.get_size();
+
+    size_t reduction_nelems(1);
+    for (int i = dst_nd; i < src_nd; ++i) {
+        reduction_nelems *= static_cast<size_t>(src_shape_ptr[i]);
+    }
+
+    // check that dst and src do not overlap
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    // destination must be ample enough to accommodate all elements
+    {
+        auto dst_offsets = dst.get_minmax_offsets();
+        size_t range =
+            static_cast<size_t>(dst_offsets.second - dst_offsets.first);
+        if (range + 1 < dst_nelems) {
+            throw py::value_error(
+                "Destination array can not accommodate all the "
+                "elements of source array.");
+        }
+    }
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    namespace td_ns = dpctl::tensor::type_dispatch;
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    // handle special case when both reduction and iteration are 1D contiguous
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+    bool is_src_f_contig = src.is_f_contiguous();
+
+    if ((is_src_c_contig && is_dst_c_contig) ||
+        (is_src_f_contig && dst_nelems == 1))
+    {
+        auto fn = axis1_temps_dispatch_table[src_typeid][dst_typeid];
+        if (fn != nullptr) {
+            size_t iter_nelems = dst_nelems;
+
+            constexpr py::ssize_t zero_offset = 0;
+
+            sycl::event reduction_over_axis_contig_ev =
+                fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                   dst.get_data(),
+                   zero_offset, // iteration_src_offset
+                   zero_offset, // iteration_dst_offset
+                   zero_offset, // reduction_src_offset
+                   depends);
+
+            sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                exec_q, {src, dst}, {reduction_over_axis_contig_ev});
+
+            return std::make_pair(keep_args_event,
+                                  reduction_over_axis_contig_ev);
+        }
+    }
+    else if (is_src_f_contig &&
+             ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous()))
+    {
+        auto fn = axis0_temps_dispatch_table[src_typeid][dst_typeid];
+        if (fn != nullptr) {
+            size_t iter_nelems = dst_nelems;
+
+            constexpr py::ssize_t zero_offset = 0;
+
+            sycl::event reduction_over_axis_contig_ev =
+                fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                   dst.get_data(),
+                   zero_offset, // iteration_src_offset
+                   zero_offset, // iteration_dst_offset
+                   zero_offset, // reduction_src_offset
+                   depends);
+
+            sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                exec_q, {src, dst}, {reduction_over_axis_contig_ev});
+
+            return std::make_pair(keep_args_event,
+                                  reduction_over_axis_contig_ev);
+        }
+    }
+
+    using dpctl::tensor::py_internal::simplify_iteration_space;
+    using dpctl::tensor::py_internal::simplify_iteration_space_1;
+
+    auto const &src_shape_vecs = src.get_shape_vector();
+    auto const &src_strides_vecs = src.get_strides_vector();
+    auto const &dst_strides_vecs = dst.get_strides_vector();
+
+    int reduction_nd = trailing_dims_to_reduce;
+    const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd;
+    using shT = std::vector<py::ssize_t>;
+    shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd,
+                              std::end(src_strides_vecs));
+
+    shT simplified_reduction_shape;
+    shT simplified_reduction_src_strides;
+    py::ssize_t reduction_src_offset(0);
+
+    simplify_iteration_space_1(
+        reduction_nd, reduction_shape_ptr, reduction_src_strides,
+        // output
+        simplified_reduction_shape, simplified_reduction_src_strides,
+        reduction_src_offset);
+
+    const py::ssize_t *iteration_shape_ptr = src_shape_ptr;
+
+    shT iteration_src_strides(std::begin(src_strides_vecs),
+                              std::begin(src_strides_vecs) + iteration_nd);
+    shT const &iteration_dst_strides = dst_strides_vecs;
+
+    shT simplified_iteration_shape;
+    shT simplified_iteration_src_strides;
+    shT simplified_iteration_dst_strides;
+    py::ssize_t iteration_src_offset(0);
+    py::ssize_t iteration_dst_offset(0);
+
+    if (iteration_nd == 0) {
+        if (dst_nelems != 1) {
+            throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1");
+        }
+        iteration_nd = 1;
+        simplified_iteration_shape.push_back(1);
+        simplified_iteration_src_strides.push_back(0);
+        simplified_iteration_dst_strides.push_back(0);
+    }
+    else {
+        simplify_iteration_space(iteration_nd, iteration_shape_ptr,
+                                 iteration_src_strides, iteration_dst_strides,
+                                 // output
+                                 simplified_iteration_shape,
+                                 simplified_iteration_src_strides,
+                                 simplified_iteration_dst_strides,
+                                 iteration_src_offset, iteration_dst_offset);
+    }
+
+    if ((reduction_nd == 1) && (iteration_nd == 1)) {
+        bool mat_reduce_over_axis1 = false;
+        bool mat_reduce_over_axis0 = false;
+        bool array_reduce_all_elems = false;
+        size_t iter_nelems = dst_nelems;
+
+        if (simplified_reduction_src_strides[0] == 1) {
+            array_reduce_all_elems = (simplified_iteration_shape[0] == 1);
+            mat_reduce_over_axis1 =
+                (simplified_iteration_dst_strides[0] == 1) &&
+                (static_cast<size_t>(simplified_iteration_src_strides[0]) ==
+                 reduction_nelems);
+        }
+        else if (static_cast<size_t>(simplified_reduction_src_strides[0]) ==
+                 iter_nelems)
+        {
+            mat_reduce_over_axis0 =
+                (simplified_iteration_dst_strides[0] == 1) &&
+                (simplified_iteration_src_strides[0] == 1);
+        }
+
+        if (mat_reduce_over_axis1 || array_reduce_all_elems) {
+            auto fn = axis1_temps_dispatch_table[src_typeid][dst_typeid];
+            if (fn != nullptr) {
+                sycl::event reduction_over_axis1_contig_ev =
+                    fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                       dst.get_data(), iteration_src_offset,
+                       iteration_dst_offset, reduction_src_offset, depends);
+
+                sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                    exec_q, {src, dst}, {reduction_over_axis1_contig_ev});
+
+                return std::make_pair(keep_args_event,
+                                      reduction_over_axis1_contig_ev);
+            }
+        }
+        else if (mat_reduce_over_axis0) {
+            auto fn = axis0_temps_dispatch_table[src_typeid][dst_typeid];
+            if (fn != nullptr) {
+                sycl::event reduction_over_axis0_contig_ev =
+                    fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                       dst.get_data(), iteration_src_offset,
+                       iteration_dst_offset, reduction_src_offset, depends);
+
+                sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                    exec_q, {src, dst}, {reduction_over_axis0_contig_ev});
+
+                return std::make_pair(keep_args_event,
+                                      reduction_over_axis0_contig_ev);
+            }
+        }
+    }
+
+    auto fn = temps_dispatch_table[src_typeid][dst_typeid];
+    if (fn == nullptr) {
+        throw std::runtime_error("Datatypes are not supported");
+    }
+
+    std::vector<sycl::event> host_task_events{};
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+    const auto &arrays_metainfo_packing_triple_ =
+        device_allocate_and_pack<py::ssize_t>(
+            exec_q, host_task_events,
+            // iteration metadata
+            simplified_iteration_shape, simplified_iteration_src_strides,
+            simplified_iteration_dst_strides,
+            // reduction metadata
+            simplified_reduction_shape, simplified_reduction_src_strides);
+    py::ssize_t *temp_allocation_ptr =
+        std::get<0>(arrays_metainfo_packing_triple_);
+    if (temp_allocation_ptr == nullptr) {
+        throw std::runtime_error("Unable to allocate memory on device");
+    }
+    const auto &copy_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_);
+
+    py::ssize_t *iter_shape_and_strides = temp_allocation_ptr;
+    py::ssize_t *reduction_shape_stride =
+        temp_allocation_ptr + 3 * simplified_iteration_shape.size();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.resize(depends.size());
+    std::copy(depends.begin(), depends.end(), all_deps.begin());
+    all_deps.push_back(copy_metadata_ev);
+
+    auto reduction_ev =
+        fn(exec_q, dst_nelems, reduction_nelems, src.get_data(),
+           dst.get_data(), iteration_nd, iter_shape_and_strides,
+           iteration_src_offset, iteration_dst_offset,
+           reduction_nd, // number dimensions being reduced
+           reduction_shape_stride, reduction_src_offset, all_deps);
+
+    sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(reduction_ev);
+        const auto &ctx = exec_q.get_context();
+        cgh.host_task([ctx, temp_allocation_ptr] {
+            sycl::free(temp_allocation_ptr, ctx);
+        });
+    });
+    host_task_events.push_back(temp_cleanup_ev);
+
+    sycl::event keep_args_event =
+        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events);
+
+    return std::make_pair(keep_args_event, reduction_ev);
+}
+
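
Both strided paths pack every shape and stride vector into a single USM allocation: the three iteration vectors first, then the two reduction vectors, which is why reduction_shape_stride sits at offset 3 * simplified_iteration_shape.size(). A small host-side sketch of that packed layout (a plain std::vector stands in for the USM buffer):

    // packing_sketch.cpp: one buffer, two logical blocks
    #include <cstdio>
    #include <vector>

    int main()
    {
        std::vector<long> iter_shape{4, 5}, iter_src_st{5, 1}, iter_dst_st{5, 1};
        std::vector<long> red_shape{3}, red_src_st{20};

        // [iter shape | iter src strides | iter dst strides | red shape | red strides]
        std::vector<long> packed;
        for (const auto &v :
             {iter_shape, iter_src_st, iter_dst_st, red_shape, red_src_st}) {
            packed.insert(packed.end(), v.begin(), v.end());
        }

        const long *iter_shape_and_strides = packed.data();
        const long *reduction_shape_stride =
            packed.data() + 3 * iter_shape.size();

        std::printf("iteration block at %td, reduction block at %td\n",
                    iter_shape_and_strides - packed.data(),
                    reduction_shape_stride - packed.data()); // 0 and 6
    }
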
+/*! @brief Template implementing Python API for searching over an axis */
+template <typename strided_fnT, typename contig_fnT>
+std::pair<sycl::event, sycl::event> py_search_over_axis(
+    const dpctl::tensor::usm_ndarray &src,
+    int trailing_dims_to_reduce, // comp over this many trailing indexes
+    const dpctl::tensor::usm_ndarray &dst,
+    sycl::queue &exec_q,
+    const std::vector<sycl::event> &depends,
+    const strided_fnT &strided_dispatch_table,
+    const contig_fnT &axis0_contig_dispatch_table,
+    const contig_fnT &axis1_contig_dispatch_table)
+{
+    int src_nd = src.get_ndim();
+    int iteration_nd = src_nd - trailing_dims_to_reduce;
+    if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) {
+        throw py::value_error("Trailing_dims_to_reduce must be positive, but "
+                              "no greater than rank of the array being "
+                              "reduced");
+    }
+
+    int dst_nd = dst.get_ndim();
+    if (dst_nd != iteration_nd) {
+        throw py::value_error("Destination array rank does not match input "
+                              "array rank and number of reduced dimensions");
+    }
+
+    const py::ssize_t *src_shape_ptr = src.get_shape_raw();
+    const py::ssize_t *dst_shape_ptr = dst.get_shape_raw();
+
+    bool same_shapes = true;
+    for (int i = 0; same_shapes && (i < dst_nd); ++i) {
+        same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]);
+    }
+
+    if (!same_shapes) {
+        throw py::value_error("Destination shape does not match unreduced "
+                              "dimensions of the input shape");
+    }
+
+    if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) {
+        throw py::value_error(
+            "Execution queue is not compatible with allocation queues");
+    }
+
+    size_t dst_nelems = dst.get_size();
+
+    size_t reduction_nelems(1);
+    for (int i = dst_nd; i < src_nd; ++i) {
+        reduction_nelems *= static_cast<size_t>(src_shape_ptr[i]);
+    }
+
+    // check that dst and src do not overlap
+    auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
+    if (overlap(src, dst)) {
+        throw py::value_error("Arrays index overlapping segments of memory");
+    }
+
+    // destination must be ample enough to accommodate all elements
+    {
+        auto dst_offsets = dst.get_minmax_offsets();
+        size_t range =
+            static_cast<size_t>(dst_offsets.second - dst_offsets.first);
+        if (range + 1 < dst_nelems) {
+            throw py::value_error(
+                "Destination array can not accommodate all the "
+                "elements of source array.");
+        }
+    }
+
+    int src_typenum = src.get_typenum();
+    int dst_typenum = dst.get_typenum();
+
+    namespace td_ns = dpctl::tensor::type_dispatch;
+    const auto &array_types = td_ns::usm_ndarray_types();
+    int src_typeid = array_types.typenum_to_lookup_id(src_typenum);
+    int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum);
+
+    // handle special case when both reduction and iteration are 1D contiguous
+    bool is_src_c_contig = src.is_c_contiguous();
+    bool is_dst_c_contig = dst.is_c_contiguous();
+    bool is_src_f_contig = src.is_f_contiguous();
+
+    if ((is_src_c_contig && is_dst_c_contig) ||
+        (is_src_f_contig && dst_nelems == 1))
+    {
+        auto fn = axis1_contig_dispatch_table[src_typeid][dst_typeid];
+        if (fn != nullptr) {
+            size_t iter_nelems = dst_nelems;
+
+            constexpr py::ssize_t zero_offset = 0;
+
+            sycl::event reduction_over_axis_contig_ev =
+                fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                   dst.get_data(),
+                   zero_offset, // iteration_src_offset
+                   zero_offset, // iteration_dst_offset
+                   zero_offset, // reduction_src_offset
+                   depends);
+
+            sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                exec_q, {src, dst}, {reduction_over_axis_contig_ev});
+
+            return std::make_pair(keep_args_event,
+                                  reduction_over_axis_contig_ev);
+        }
+    }
+    else if (is_src_f_contig &&
+             ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous()))
+    {
+        auto fn = axis0_contig_dispatch_table[src_typeid][dst_typeid];
+        if (fn != nullptr) {
+            size_t iter_nelems = dst_nelems;
+
+            constexpr py::ssize_t zero_offset = 0;
+
+            sycl::event reduction_over_axis_contig_ev =
+                fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                   dst.get_data(),
+                   zero_offset, // iteration_src_offset
+                   zero_offset, // iteration_dst_offset
+                   zero_offset, // reduction_src_offset
+                   depends);
+
+            sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                exec_q, {src, dst}, {reduction_over_axis_contig_ev});
+
+            return std::make_pair(keep_args_event,
+                                  reduction_over_axis_contig_ev);
+        }
+    }
+
+    using dpctl::tensor::py_internal::simplify_iteration_space;
+    using dpctl::tensor::py_internal::simplify_iteration_space_1;
+
+    auto const &src_shape_vecs = src.get_shape_vector();
+    auto const &src_strides_vecs = src.get_strides_vector();
+    auto const &dst_strides_vecs = dst.get_strides_vector();
+
+    int reduction_nd = trailing_dims_to_reduce;
+    const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd;
+    using shT = std::vector<py::ssize_t>;
+    shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd,
+                              std::end(src_strides_vecs));
+
+    shT compact_reduction_shape;
+    shT compact_reduction_src_strides;
+    py::ssize_t reduction_src_offset(0);
+
+    compact_iteration_space(
+        reduction_nd, reduction_shape_ptr, reduction_src_strides,
+        // output
+        compact_reduction_shape, compact_reduction_src_strides);
+
+    const py::ssize_t *iteration_shape_ptr = src_shape_ptr;
+
+    shT iteration_src_strides(std::begin(src_strides_vecs),
+                              std::begin(src_strides_vecs) + iteration_nd);
+    shT const &iteration_dst_strides = dst_strides_vecs;
+
+    shT simplified_iteration_shape;
+    shT simplified_iteration_src_strides;
+    shT simplified_iteration_dst_strides;
+    py::ssize_t iteration_src_offset(0);
+    py::ssize_t iteration_dst_offset(0);
+
+    if (iteration_nd == 0) {
+        if (dst_nelems != 1) {
+            throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1");
+        }
+        iteration_nd = 1;
+        simplified_iteration_shape.push_back(1);
+        simplified_iteration_src_strides.push_back(0);
+        simplified_iteration_dst_strides.push_back(0);
+    }
+    else {
+        simplify_iteration_space(iteration_nd, iteration_shape_ptr,
+                                 iteration_src_strides, iteration_dst_strides,
+                                 // output
+                                 simplified_iteration_shape,
+                                 simplified_iteration_src_strides,
+                                 simplified_iteration_dst_strides,
+                                 iteration_src_offset, iteration_dst_offset);
+    }
+
+    if ((reduction_nd == 1) && (iteration_nd == 1)) {
+        bool mat_reduce_over_axis1 = false;
+        bool mat_reduce_over_axis0 = false;
+        bool array_reduce_all_elems = false;
+        size_t iter_nelems = dst_nelems;
+
+        if (compact_reduction_src_strides[0] == 1) {
+            array_reduce_all_elems = (simplified_iteration_shape[0] == 1);
+            mat_reduce_over_axis1 =
+                (simplified_iteration_dst_strides[0] == 1) &&
+                (static_cast<size_t>(simplified_iteration_src_strides[0]) ==
+                 reduction_nelems);
+        }
+        else if (static_cast<size_t>(compact_reduction_src_strides[0]) ==
+                 iter_nelems)
+        {
+            mat_reduce_over_axis0 =
+                (simplified_iteration_dst_strides[0] == 1) &&
+                (simplified_iteration_src_strides[0] == 1);
+        }
+
+        if (mat_reduce_over_axis1 || array_reduce_all_elems) {
+            auto fn = axis1_contig_dispatch_table[src_typeid][dst_typeid];
+            if (fn != nullptr) {
+                sycl::event reduction_over_axis1_contig_ev =
+                    fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                       dst.get_data(), iteration_src_offset,
+                       iteration_dst_offset, reduction_src_offset, depends);
+
+                sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                    exec_q, {src, dst}, {reduction_over_axis1_contig_ev});
+
+                return std::make_pair(keep_args_event,
+                                      reduction_over_axis1_contig_ev);
+            }
+        }
+        else if (mat_reduce_over_axis0) {
+            auto fn = axis0_contig_dispatch_table[src_typeid][dst_typeid];
+            if (fn != nullptr) {
+                sycl::event reduction_over_axis0_contig_ev =
+                    fn(exec_q, iter_nelems, reduction_nelems, src.get_data(),
+                       dst.get_data(), iteration_src_offset,
+                       iteration_dst_offset, reduction_src_offset, depends);
+
+                sycl::event keep_args_event = dpctl::utils::keep_args_alive(
+                    exec_q, {src, dst}, {reduction_over_axis0_contig_ev});
+
+                return std::make_pair(keep_args_event,
+                                      reduction_over_axis0_contig_ev);
+            }
+        }
+    }
+
+    auto fn = strided_dispatch_table[src_typeid][dst_typeid];
+    if (fn == nullptr) {
+        throw std::runtime_error("Datatypes are not supported");
+    }
+
+    std::vector<sycl::event> host_task_events{};
+
+    using dpctl::tensor::offset_utils::device_allocate_and_pack;
+
+    const auto &arrays_metainfo_packing_triple_ =
+        device_allocate_and_pack<py::ssize_t>(
+            exec_q, host_task_events,
+            // iteration metadata
+            simplified_iteration_shape, simplified_iteration_src_strides,
+            simplified_iteration_dst_strides,
+            // reduction metadata
+            compact_reduction_shape, compact_reduction_src_strides);
+    py::ssize_t *temp_allocation_ptr =
+        std::get<0>(arrays_metainfo_packing_triple_);
+    if (temp_allocation_ptr == nullptr) {
+        throw std::runtime_error("Unable to allocate memory on device");
+    }
+    const auto &copy_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_);
+
+    py::ssize_t *iter_shape_and_strides = temp_allocation_ptr;
+    py::ssize_t *reduction_shape_stride =
+        temp_allocation_ptr + 3 * simplified_iteration_shape.size();
+
+    std::vector<sycl::event> all_deps;
+    all_deps.reserve(depends.size() + 1);
+    all_deps.resize(depends.size());
+    std::copy(depends.begin(), depends.end(), all_deps.begin());
+    all_deps.push_back(copy_metadata_ev);
+
+    auto comp_ev = fn(exec_q, dst_nelems, reduction_nelems, src.get_data(),
+                      dst.get_data(), iteration_nd, iter_shape_and_strides,
+                      iteration_src_offset, iteration_dst_offset,
+                      reduction_nd, // number dimensions being reduced
+                      reduction_shape_stride, reduction_src_offset, all_deps);
+
+    sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(comp_ev);
+        const auto &ctx = exec_q.get_context();
+        cgh.host_task([ctx, temp_allocation_ptr] {
+            sycl::free(temp_allocation_ptr, ctx);
+        });
+    });
+    host_task_events.push_back(temp_cleanup_ev);
+
+    sycl::event keep_args_event =
+        dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events);
+
+    return std::make_pair(keep_args_event, comp_ev);
+}
+
+extern void init_reduction_functions(py::module_ m);
+
+} // namespace py_internal
+} // namespace tensor
+} // namespace dpctl
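
Each strided path also finishes the same way: a host_task ordered after the kernel frees the packed temporary once it is no longer needed, and keep_args_alive extends the lifetime of the Python arrays until all tasks complete. A minimal SYCL sketch of the cleanup half of that idiom, assuming a SYCL 2020 toolchain and a usable default device (the fill stands in for the reduction kernel):

    // cleanup_sketch.cpp: deferred sycl::free via a dependent host_task
    #include <sycl/sycl.hpp>

    int main()
    {
        sycl::queue q;
        int *tmp = sycl::malloc_device<int>(16, q);

        sycl::event comp = q.fill(tmp, 0, 16); // stand-in for the kernel

        sycl::event cleanup = q.submit([&](sycl::handler &cgh) {
            cgh.depends_on(comp);
            const auto ctx = q.get_context();
            cgh.host_task([ctx, tmp]() { sycl::free(tmp, ctx); });
        });
        cleanup.wait();
        return 0;
    }
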
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include "dpctl4pybind11.hpp" +#include +#include +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "utils/type_dispatch.hpp" + +#include "reduction_atomic_support.hpp" +#include "reduction_over_axis.hpp" + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + sum_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + sum_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + sum_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + sum_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + sum_over_axis1_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + sum_over_axis0_contig_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_sum_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + using dpctl::tensor::kernels::SumOverAxisAtomicStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(sum_over_axis_strided_atomic_dispatch_table); + + using dpctl::tensor::kernels::SumOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(sum_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::SumOverAxis1AtomicContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(sum_over_axis1_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::SumOverAxis0AtomicContigFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(sum_over_axis0_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::SumOverAxis1TempsContigFactory; + DispatchTableBuilder + dtb5; + dtb5.populate_dispatch_table(sum_over_axis1_contig_temps_dispatch_table); + + using dpctl::tensor::kernels::SumOverAxis0TempsContigFactory; + DispatchTableBuilder + dtb6; + dtb6.populate_dispatch_table(sum_over_axis0_contig_temps_dispatch_table); +} + +using atomic_support::atomic_support_fn_ptr_t; +static atomic_support_fn_ptr_t sum_atomic_support_vector[td_ns::num_types]; + +void populate_sum_atomic_support_dispatch_vector(void) +{ + using td_ns::DispatchVectorBuilder; + + using atomic_support::SumAtomicSupportFactory; + DispatchVectorBuilder + dvb; + 
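+    // NB: a hedged sketch (assumed semantics, not part of this commit) of
+    // how the vector populated below is consumed: the Python binding
+    // indexes it by the result type id and calls the retrieved function to
+    // decide whether atomic kernels may be used for a given queue and
+    // allocation, roughly
+    //
+    //     auto check_atomic_support = sum_atomic_support_vector[dst_typeid];
+    //     bool supports_atomics = check_atomic_support(q, dst_usm_type);
+    //
+    // (the exact signature of the stored function is an assumption here).
+    // When atomics are not supported, the binding falls back to the
+    // tree-reduction ("temps") dispatch tables populated above; a nullptr
+    // entry in those tables marks an unsupported (src, dst) dtype pair.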
dvb.populate_dispatch_vector(sum_atomic_support_vector); +} + +} // namespace impl + +void init_sum(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + { + using impl::populate_sum_over_axis_dispatch_tables; + populate_sum_over_axis_dispatch_tables(); + using impl::sum_over_axis0_contig_atomic_dispatch_table; + using impl::sum_over_axis0_contig_temps_dispatch_table; + using impl::sum_over_axis1_contig_atomic_dispatch_table; + using impl::sum_over_axis1_contig_temps_dispatch_table; + using impl::sum_over_axis_strided_atomic_dispatch_table; + using impl::sum_over_axis_strided_temps_dispatch_table; + + using impl::populate_sum_atomic_support_dispatch_vector; + populate_sum_atomic_support_dispatch_vector(); + using impl::sum_atomic_support_vector; + + auto sum_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_reduction_over_axis; + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + sum_over_axis_strided_atomic_dispatch_table, + sum_over_axis0_contig_atomic_dispatch_table, + sum_over_axis1_contig_atomic_dispatch_table, + sum_over_axis_strided_temps_dispatch_table, + sum_over_axis0_contig_temps_dispatch_table, + sum_over_axis1_contig_temps_dispatch_table, + sum_atomic_support_vector); + }; + m.def("_sum_over_axis", sum_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto sum_dtype_supported = + [&](const py::dtype &input_dtype, const py::dtype &output_dtype, + const std::string &dst_usm_type, sycl::queue &q) { + using dpctl::tensor::py_internal::py_reduction_dtype_supported; + return py_reduction_dtype_supported( + input_dtype, output_dtype, dst_usm_type, q, + sum_over_axis_strided_atomic_dispatch_table, + sum_over_axis_strided_temps_dispatch_table, + sum_atomic_support_vector); + }; + m.def("_sum_over_axis_dtype_supported", sum_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype"), + py::arg("dst_usm_type"), py::arg("sycl_queue")); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reductions/sum.hpp b/dpctl/tensor/libtensor/source/reductions/sum.hpp new file mode 100644 index 0000000000..ded0d14809 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reductions/sum.hpp @@ -0,0 +1,41 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_sum(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp index 254856ec38..d07d5cf084 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_py.cpp @@ -48,7 +48,7 @@ #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include "linear_sequences.hpp" -#include "reduction_over_axis.hpp" +#include "reductions/reduction_common.hpp" #include "repeat.hpp" #include "simplify_iteration_space.hpp" #include "triul_ctor.hpp" diff --git a/dpctl/tests/test_tensor_sum.py b/dpctl/tests/test_tensor_sum.py index f6d1ca086b..a4e202f073 100644 --- a/dpctl/tests/test_tensor_sum.py +++ b/dpctl/tests/test_tensor_sum.py @@ -173,6 +173,21 @@ def test_largish_reduction(arg_dtype, n): assert dpt.all(dpt.equal(y1, n * m)) +@pytest.mark.parametrize("n", [1023, 1024, 1025]) +def test_largish_reduction_axis1_axis0(n): + get_queue_or_skip() + + m = 25 + x1 = dpt.ones((m, n), dtype="f4") + x2 = dpt.ones((n, m), dtype="f4") + + y1 = dpt.sum(x1, axis=1) + y2 = dpt.sum(x2, axis=0) + + assert dpt.all(y1 == n) + assert dpt.all(y2 == n) + + def test_axis0_bug(): "gh-1391" get_queue_or_skip() diff --git a/dpctl/tests/test_usm_ndarray_reductions.py b/dpctl/tests/test_usm_ndarray_reductions.py index 8d66f35d71..73cf9459a7 100644 --- a/dpctl/tests/test_usm_ndarray_reductions.py +++ b/dpctl/tests/test_usm_ndarray_reductions.py @@ -18,10 +18,32 @@ import numpy as np import pytest +from numpy.testing import assert_allclose import dpctl.tensor as dpt from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported +_no_complex_dtypes = [ + "?", + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", +] + + +_all_dtypes = _no_complex_dtypes + [ + "c8", + "c16", +] + def test_max_min_axis(): get_queue_or_skip() @@ -234,3 +256,176 @@ def test_reduction_arg_validation(): dpt.max(x) with pytest.raises(ValueError): dpt.argmax(x) + + +@pytest.mark.parametrize("arg_dtype", _no_complex_dtypes[1:]) +def test_logsumexp_arg_dtype_default_output_dtype_matrix(arg_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.logsumexp(m) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype.kind == "f" + tol = dpt.finfo(r.dtype).resolution + assert_allclose( + dpt.asnumpy(r), + np.logaddexp.reduce(dpt.asnumpy(m), dtype=r.dtype), + rtol=tol, + atol=tol, + ) + + +def test_logsumexp_empty(): + get_queue_or_skip() + x = dpt.empty((0,), dtype="f4") + y = dpt.logsumexp(x) + assert y.shape == tuple() + assert y == -dpt.inf + + +def test_logsumexp_axis(): + get_queue_or_skip() + + m = dpt.ones((3, 4, 5, 6, 7), dtype="f4") + s = dpt.logsumexp(m, axis=(1, 2, -1)) + + assert isinstance(s, dpt.usm_ndarray) + assert s.shape == (3, 6) + tol = dpt.finfo(s.dtype).resolution + assert_allclose( + dpt.asnumpy(s), + np.logaddexp.reduce(dpt.asnumpy(m), axis=(1, 2, -1), dtype=s.dtype), + rtol=tol, + atol=tol, + ) + + +@pytest.mark.parametrize("arg_dtype", 
_no_complex_dtypes[1:]) +@pytest.mark.parametrize("out_dtype", _all_dtypes[1:]) +def test_logsumexp_arg_out_dtype_matrix(arg_dtype, out_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + skip_if_dtype_not_supported(out_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.logsumexp(m, dtype=out_dtype) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == dpt.dtype(out_dtype) + + +def test_logsumexp_keepdims(): + get_queue_or_skip() + + m = dpt.ones((3, 4, 5, 6, 7), dtype="i4") + s = dpt.logsumexp(m, axis=(1, 2, -1), keepdims=True) + + assert isinstance(s, dpt.usm_ndarray) + assert s.shape == (3, 1, 1, 6, 1) + + +def test_logsumexp_keepdims_zero_size(): + get_queue_or_skip() + n = 10 + a = dpt.ones((n, 0, n)) + + s1 = dpt.logsumexp(a, keepdims=True) + assert s1.shape == (1, 1, 1) + + s2 = dpt.logsumexp(a, axis=(0, 1), keepdims=True) + assert s2.shape == (1, 1, n) + + s3 = dpt.logsumexp(a, axis=(1, 2), keepdims=True) + assert s3.shape == (n, 1, 1) + + s4 = dpt.logsumexp(a, axis=(0, 2), keepdims=True) + assert s4.shape == (1, 0, 1) + + a0 = a[0] + s5 = dpt.logsumexp(a0, keepdims=True) + assert s5.shape == (1, 1) + + +def test_logsumexp_scalar(): + get_queue_or_skip() + + m = dpt.ones(()) + s = dpt.logsumexp(m) + + assert isinstance(s, dpt.usm_ndarray) + assert m.sycl_queue == s.sycl_queue + assert s.shape == () + + +def test_logsumexp_complex(): + get_queue_or_skip() + + x = dpt.zeros(1, dtype="c8") + with pytest.raises(TypeError): + dpt.logsumexp(x) + + +def test_logsumexp_int_axis(): + get_queue_or_skip() + + x = dpt.zeros((8, 10), dtype="f4") + res = dpt.logsumexp(x, axis=0) + assert res.ndim == 1 + assert res.shape[0] == 10 + + +def test_logsumexp_invalid_arr(): + x = dict() + with pytest.raises(TypeError): + dpt.logsumexp(x) + + +@pytest.mark.parametrize("arg_dtype", _no_complex_dtypes[1:]) +def test_hypot_arg_dtype_default_output_dtype_matrix(arg_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.reduce_hypot(m) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype.kind == "f" + tol = dpt.finfo(r.dtype).resolution + assert_allclose( + dpt.asnumpy(r), + np.hypot.reduce(dpt.asnumpy(m), dtype=r.dtype), + rtol=tol, + atol=tol, + ) + + +def test_hypot_empty(): + get_queue_or_skip() + x = dpt.empty((0,), dtype="f4") + y = dpt.reduce_hypot(x) + assert y.shape == tuple() + assert y == 0 + + +@pytest.mark.parametrize("arg_dtype", _no_complex_dtypes[1:]) +@pytest.mark.parametrize("out_dtype", _all_dtypes[1:]) +def test_hypot_arg_out_dtype_matrix(arg_dtype, out_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + skip_if_dtype_not_supported(out_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.reduce_hypot(m, dtype=out_dtype) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == dpt.dtype(out_dtype) + + +def test_hypot_complex(): + get_queue_or_skip() + + x = dpt.zeros(1, dtype="c8") + with pytest.raises(TypeError): + dpt.reduce_hypot(x) From b3e9465c30c92977f76ad956334f7d1e7b9352f2 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 26 Oct 2023 12:34:34 -0500 Subject: [PATCH 55/83] Implementations of reductions for contigous case must take offsets into account --- .../libtensor/include/kernels/reductions.hpp | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 
b9e2918c8c..d5fddce6ed 100644
--- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp
@@ -1339,7 +1339,7 @@ sycl::event reduction_over_group_temps_strided_impl(
                 static_cast<py::ssize_t>(remaining_reduction_nelems)};
             ResIndexerT res_iter_indexer{iter_nd, iter_res_offset,
                                          /* shape */ iter_shape_and_strides,
-                                         /*s trides */ iter_shape_and_strides +
+                                         /* strides */ iter_shape_and_strides +
                                              2 * iter_nd};

             InputOutputIterIndexerT in_out_iter_indexer{inp_indexer,
@@ -1424,8 +1424,9 @@ sycl::event reduction_axis1_over_group_temps_contig_impl(
     py::ssize_t reduction_arg_offset,
     const std::vector<sycl::event> &depends)
 {
-    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp);
-    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + reduction_arg_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;

     constexpr resTy identity_val = su_ns::Identity<ReductionOpT, resTy>::value;

@@ -1767,8 +1768,9 @@ sycl::event reduction_axis0_over_group_temps_contig_impl(
     py::ssize_t reduction_arg_offset,
     const std::vector<sycl::event> &depends)
 {
-    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp);
-    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + reduction_arg_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;

     constexpr resTy identity_val = su_ns::Identity<ReductionOpT, resTy>::value;

@@ -4258,8 +4260,9 @@ sycl::event search_axis1_over_group_temps_contig_impl(
     py::ssize_t reduction_arg_offset,
     const std::vector<sycl::event> &depends)
 {
-    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp);
-    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + reduction_arg_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;

     constexpr argTy identity_val = su_ns::Identity<ReductionOpT, argTy>::value;
     constexpr resTy idx_identity_val = su_ns::Identity<IndexOpT, resTy>::value;

@@ -4635,8 +4638,9 @@ sycl::event search_axis0_over_group_temps_contig_impl(
     py::ssize_t reduction_arg_offset,
     const std::vector<sycl::event> &depends)
 {
-    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp);
-    resTy *res_tp = reinterpret_cast<resTy *>(res_cp);
+    const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
+                          iter_arg_offset + reduction_arg_offset;
+    resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;

     constexpr argTy identity_val = su_ns::Identity<ReductionOpT, argTy>::value;
     constexpr resTy idx_identity_val = su_ns::Identity<IndexOpT, resTy>::value;

From c63c5451ec92620956d106df8f68ec5d8bc58680 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Thu, 26 Oct 2023 12:35:11 -0500
Subject: [PATCH 56/83] Expand test to cover non-contiguous
input that can be simplified into one --- dpctl/tests/test_tensor_sum.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/dpctl/tests/test_tensor_sum.py b/dpctl/tests/test_tensor_sum.py index a4e202f073..fbfd9547e1 100644 --- a/dpctl/tests/test_tensor_sum.py +++ b/dpctl/tests/test_tensor_sum.py @@ -43,6 +43,7 @@ def test_sum_arg_dtype_default_output_dtype_matrix(arg_dtype): q = get_queue_or_skip() skip_if_dtype_not_supported(arg_dtype, q) + # test reduction for C-contiguous input m = dpt.ones(100, dtype=arg_dtype) r = dpt.sum(m) @@ -55,12 +56,20 @@ def test_sum_arg_dtype_default_output_dtype_matrix(arg_dtype): assert r.dtype.kind == "f" elif m.dtype.kind == "c": assert r.dtype.kind == "c" + assert dpt.all(r == 100) + # test reduction for strided input m = dpt.ones(200, dtype=arg_dtype)[:1:-2] r = dpt.sum(m) assert dpt.all(r == 99) + # test reduction for strided input which can be simplified + # to contiguous computation + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.sum(dpt.flip(m)) + assert dpt.all(r == 100) + @pytest.mark.parametrize("arg_dtype", _all_dtypes) @pytest.mark.parametrize("out_dtype", _all_dtypes[1:]) From e92d1f9ad3a138c9c85e181c0e31e7293a4b8eb2 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 26 Oct 2023 12:43:05 -0500 Subject: [PATCH 57/83] Add tests for strided input where contig implementation is applicable --- dpctl/tests/test_usm_ndarray_reductions.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/dpctl/tests/test_usm_ndarray_reductions.py b/dpctl/tests/test_usm_ndarray_reductions.py index 73cf9459a7..d500ce26b6 100644 --- a/dpctl/tests/test_usm_ndarray_reductions.py +++ b/dpctl/tests/test_usm_ndarray_reductions.py @@ -169,6 +169,14 @@ def test_search_reduction_kernels(arg_dtype): m = dpt.argmax(x) assert m == idx + m = dpt.argmax(dpt.flip(x)) + assert m == x.size - 1 - idx + + y = dpt.ones(2 * x.size, dtype=arg_dtype, sycl_queue=q) + y[::2] = x + m = dpt.argmax(y) + assert m == 2 * idx + x = dpt.reshape(x, (24, 1025)) x[idx_tup[0], :] = 3 From 702b707250d9ab226704d6ce3fee7e2307d0fdbb Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 26 Oct 2023 10:59:54 -0700 Subject: [PATCH 58/83] Added comments to the test file --- dpctl/tests/test_usm_ndarray_reductions.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dpctl/tests/test_usm_ndarray_reductions.py b/dpctl/tests/test_usm_ndarray_reductions.py index d500ce26b6..56059e54b8 100644 --- a/dpctl/tests/test_usm_ndarray_reductions.py +++ b/dpctl/tests/test_usm_ndarray_reductions.py @@ -169,9 +169,12 @@ def test_search_reduction_kernels(arg_dtype): m = dpt.argmax(x) assert m == idx + # test case of strided input mapping to contig + # implementation m = dpt.argmax(dpt.flip(x)) assert m == x.size - 1 - idx + # test case of strided implementation y = dpt.ones(2 * x.size, dtype=arg_dtype, sycl_queue=q) y[::2] = x m = dpt.argmax(y) From dcb566a424a902af8ccb7e96021a3899e98c925b Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 26 Oct 2023 15:24:17 -0500 Subject: [PATCH 59/83] Corrected logical error in can_use_reduce_over_group trait implementation --- dpctl/tensor/libtensor/include/kernels/reductions.hpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index d5fddce6ed..884a7c5461 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -50,12 +50,18 @@ namespace tensor 
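// A worked reading of the hunk below (hedged: the template parameter lists
// were garbled in this dump and are assumed to be
// <typename T, typename ReductionOpT>). Before this change the trait read,
// in effect,
//
//     can_use_reduce_over_group =
//         sycl::has_known_identity<ReductionOpT, T>::value &&
//         !std::is_same_v<T, std::int64_t> &&
//         !std::is_same_v<T, std::uint64_t> &&
//         !std::is_same_v<ReductionOpT, sycl::multiplies<T>>;
//
// i.e. sycl::reduce_over_group was avoided for *every* 64-bit integer
// reduction and *every* multiplies-reduction, although only the combination
// of the two needs the workaround. For example, a sum over std::int64_t
// (ReductionOpT = sycl::plus<std::int64_t>) was needlessly sent down the
// slow path. The fix factors the offending conjunction into a dedicated
// trait:
//
//     needs_workaround<T, ReductionOpT> =
//         std::is_same_v<ReductionOpT, sycl::multiplies<T>> &&
//         (std::is_same_v<T, std::int64_t> ||
//          std::is_same_v<T, std::uint64_t>);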
 namespace kernels
 {

+template <typename T, typename ReductionOpT> struct needs_workaround
+{
+    static constexpr bool value =
+        std::is_same_v<ReductionOpT, sycl::multiplies<T>> &&
+        (std::is_same_v<T, std::int64_t> || std::is_same_v<T, std::uint64_t>);
+};
+
 template <typename T, typename ReductionOpT> struct can_use_reduce_over_group
 {
     static constexpr bool value =
         sycl::has_known_identity<ReductionOpT, T>::value &&
-        !std::is_same_v<T, std::int64_t> && !std::is_same_v<T, std::uint64_t> &&
-        !std::is_same_v<ReductionOpT, sycl::multiplies<T>>;
+        !needs_workaround<T, ReductionOpT>::value;
 };

 template

Date: Thu, 26 Oct 2023 17:42:26 -0500
Subject: [PATCH 60/83] The taper optimization in tree-reduction, which causes
 problems with CUDA

The optimization should not use the maximum work-group size, so as to leave
the RT some of the SLM memory.
---
 .../libtensor/include/kernels/reductions.hpp | 21 +++++++++++--------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp
index 884a7c5461..40e5bd282d 100644
--- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp
@@ -1094,7 +1094,7 @@ sycl::event reduction_over_group_temps_strided_impl(
     // max_max_wg prevents running out of resources on CPU
     constexpr size_t max_max_wg = 2048;
     size_t max_wg = std::min(
-        max_max_wg, d.get_info<sycl::info::device::max_work_group_size>());
+        max_max_wg, d.get_info<sycl::info::device::max_work_group_size>() / 2);

     size_t reductions_per_wi(preferrered_reductions_per_wi);
     if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) {
@@ -1444,7 +1444,7 @@ sycl::event reduction_axis1_over_group_temps_contig_impl(
     // max_max_wg prevents running out of resources on CPU
     constexpr size_t max_max_wg = 2048;
     size_t max_wg = std::min(
-        max_max_wg, d.get_info<sycl::info::device::max_work_group_size>());
+        max_max_wg, d.get_info<sycl::info::device::max_work_group_size>() / 2);

     size_t reductions_per_wi(preferrered_reductions_per_wi);
     if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) {
@@ -1788,7 +1788,7 @@ sycl::event reduction_axis0_over_group_temps_contig_impl(
     // max_max_wg prevents running out of resources on CPU
     constexpr size_t max_max_wg = 2048;
     size_t max_wg = std::min(
-        max_max_wg, d.get_info<sycl::info::device::max_work_group_size>());
+        max_max_wg, d.get_info<sycl::info::device::max_work_group_size>() / 2);

     size_t reductions_per_wi(preferrered_reductions_per_wi);
     if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) {
@@ -3883,8 +3883,9 @@ sycl::event search_over_group_temps_strided_impl(
     constexpr size_t preferrered_reductions_per_wi = 4;

     // max_max_wg prevents running out of resources on CPU
-    size_t max_wg = std::min(
-        size_t(2048), d.get_info<sycl::info::device::max_work_group_size>());
+    size_t max_wg =
+        std::min(size_t(2048),
+                 d.get_info<sycl::info::device::max_work_group_size>() / 2);

     size_t reductions_per_wi(preferrered_reductions_per_wi);
     if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) {
@@ -4279,8 +4280,9 @@ sycl::event search_axis1_over_group_temps_contig_impl(
     constexpr size_t preferrered_reductions_per_wi = 8;

     // max_max_wg prevents running out of resources on CPU
-    size_t max_wg = std::min(
-        size_t(2048), d.get_info<sycl::info::device::max_work_group_size>());
+    size_t max_wg =
+        std::min(size_t(2048),
+                 d.get_info<sycl::info::device::max_work_group_size>() / 2);

     size_t reductions_per_wi(preferrered_reductions_per_wi);
     if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) {
@@ -4657,8 +4659,9 @@ sycl::event search_axis0_over_group_temps_contig_impl(
     constexpr size_t preferrered_reductions_per_wi = 8;

     // max_max_wg prevents running out of resources on CPU
-    size_t max_wg = std::min(
-        size_t(2048), d.get_info<sycl::info::device::max_work_group_size>());
+    size_t max_wg =
+        std::min(size_t(2048),
+                 d.get_info<sycl::info::device::max_work_group_size>() / 2);

     size_t reductions_per_wi(preferrered_reductions_per_wi);
     if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) {

From 02e7714c8041b268497260b70453af60fc020646 Mon Sep 17 00:00:00 2001
From: ndgrigorian <46709016+ndgrigorian@users.noreply.github.com>
Date: Fri, 27 Oct 2023 15:52:04 -0700 Subject: [PATCH 61/83] Fix ``axis0`` calls in reduction Python binding (#1459) * max and min now use MinMaxAtomicSupportFactory These functions were using ArithmeticAtomicSupportFactory, which disables atomics for floating point types * Resolves #1455 This issue was caused by a typo where when the `axis0` kernels for tree and atomic reductions would be called, the `axis1` kernel would be called instead * Adds tests for #1455 resolution --- .../reductions/reduction_atomic_support.hpp | 4 +- .../source/reductions/reduction_over_axis.hpp | 7 ++-- dpctl/tests/test_tensor_sum.py | 30 ++++++++++++++ dpctl/tests/test_usm_ndarray_reductions.py | 39 +++++++++++++++++++ 4 files changed, 74 insertions(+), 6 deletions(-) diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp index 695f4b73d0..2478545efe 100644 --- a/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp +++ b/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp @@ -117,12 +117,12 @@ template struct MinMaxAtomicSupportFactory }; template -struct MaxAtomicSupportFactory : public ArithmeticAtomicSupportFactory +struct MaxAtomicSupportFactory : public MinMaxAtomicSupportFactory { }; template -struct MinAtomicSupportFactory : public ArithmeticAtomicSupportFactory +struct MinAtomicSupportFactory : public MinMaxAtomicSupportFactory { }; diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp index da8da0938d..aa46f1c02a 100644 --- a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp +++ b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp @@ -417,10 +417,10 @@ std::pair py_reduction_over_axis( typename std::remove_all_extents::type; contig_fn_ptr_T fn; if (supports_atomics) { - fn = axis1_atomic_dispatch_table[src_typeid][dst_typeid]; + fn = axis0_atomic_dispatch_table[src_typeid][dst_typeid]; } else { - fn = axis1_temps_dispatch_table[src_typeid][dst_typeid]; + fn = axis0_temps_dispatch_table[src_typeid][dst_typeid]; } if (fn != nullptr) { sycl::event reduction_over_axis0_contig_ev = @@ -727,7 +727,7 @@ std::pair py_tree_reduction_over_axis( } } else if (mat_reduce_over_axis0) { - auto fn = axis1_temps_dispatch_table[src_typeid][dst_typeid]; + auto fn = axis0_temps_dispatch_table[src_typeid][dst_typeid]; if (fn != nullptr) { sycl::event reduction_over_axis0_contig_ev = fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), @@ -929,7 +929,6 @@ std::pair py_search_over_axis( } using dpctl::tensor::py_internal::simplify_iteration_space; - using dpctl::tensor::py_internal::simplify_iteration_space_1; auto const &src_shape_vecs = src.get_shape_vector(); auto const &src_strides_vecs = src.get_strides_vector(); diff --git a/dpctl/tests/test_tensor_sum.py b/dpctl/tests/test_tensor_sum.py index fbfd9547e1..749ca055b9 100644 --- a/dpctl/tests/test_tensor_sum.py +++ b/dpctl/tests/test_tensor_sum.py @@ -212,6 +212,36 @@ def test_axis0_bug(): assert dpt.all(s == expected) +def test_sum_axis1_axis0(): + """See gh-1455""" + get_queue_or_skip() + + # The atomic case is checked in `test_usm_ndarray_reductions` + # This test checks the tree reduction path for correctness + x = dpt.reshape(dpt.arange(3 * 4 * 5, dtype="f4"), (3, 4, 5)) + + m = dpt.sum(x, axis=0) + expected = dpt.asarray( + [ + [60, 63, 66, 69, 72], + [75, 78, 81, 84, 87], + [90, 93, 96, 99, 102], + [105, 108, 
111, 114, 117], + ], + dtype="f4", + ) + tol = dpt.finfo(m.dtype).resolution + assert dpt.allclose(m, expected, atol=tol, rtol=tol) + + x = dpt.flip(x, axis=2) + m = dpt.sum(x, axis=2) + expected = dpt.asarray( + [[10, 35, 60, 85], [110, 135, 160, 185], [210, 235, 260, 285]], + dtype="f4", + ) + assert dpt.allclose(m, expected, atol=tol, rtol=tol) + + def _any_complex(dtypes): return any(dpt.isdtype(dpt.dtype(dt), "complex floating") for dt in dtypes) diff --git a/dpctl/tests/test_usm_ndarray_reductions.py b/dpctl/tests/test_usm_ndarray_reductions.py index 56059e54b8..45afb26aac 100644 --- a/dpctl/tests/test_usm_ndarray_reductions.py +++ b/dpctl/tests/test_usm_ndarray_reductions.py @@ -61,6 +61,20 @@ def test_max_min_axis(): assert dpt.all(m == x[:, 0, 0, :, 0]) +def test_max_axis1_axis0(): + """See gh-1455""" + get_queue_or_skip() + + x = dpt.reshape(dpt.arange(3 * 4 * 5), (3, 4, 5)) + + m = dpt.max(x, axis=0) + assert dpt.all(m == x[-1, :, :]) + + x = dpt.flip(x, axis=2) + m = dpt.max(x, axis=2) + assert dpt.all(m == x[:, :, 0]) + + def test_reduction_keepdims(): get_queue_or_skip() @@ -440,3 +454,28 @@ def test_hypot_complex(): x = dpt.zeros(1, dtype="c8") with pytest.raises(TypeError): dpt.reduce_hypot(x) + + +def test_tree_reduction_axis1_axis0(): + """See gh-1455""" + get_queue_or_skip() + + x = dpt.reshape(dpt.arange(3 * 4 * 5, dtype="f4"), (3, 4, 5)) + + m = dpt.logsumexp(x, axis=0) + tol = dpt.finfo(m.dtype).resolution + assert_allclose( + dpt.asnumpy(m), + np.logaddexp.reduce(dpt.asnumpy(x), axis=0, dtype=m.dtype), + rtol=tol, + atol=tol, + ) + + x = dpt.flip(x, axis=2) + m = dpt.logsumexp(x, axis=2) + assert_allclose( + dpt.asnumpy(m), + np.logaddexp.reduce(dpt.asnumpy(x), axis=2, dtype=m.dtype), + rtol=tol, + atol=tol, + ) From 9131925d6024f9d08e4f8a6c1770326cd2494131 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 31 Oct 2023 04:24:35 -0500 Subject: [PATCH 62/83] Improve raise TypeError by providing exception message (#1460) Closes gh-1457 ``` In [1]: import dpctl.tensor as dpt In [2]: dpt.asnumpy([1,2,3]) --------------------------------------------------------------------------- TypeError Traceback (most recent call last) Cell In[2], line 1 ----> 1 dpt.asnumpy([1,2,3]) File ~/repos/dpctl/dpctl/tensor/_copy_utils.py:185, in asnumpy(usm_ary) 169 def asnumpy(usm_ary): 170 """ 171 asnumpy(usm_ary) 172 (...) 
183 of `usm_ary` 184 """ --> 185 return _copy_to_numpy(usm_ary) File ~/repos/dpctl/dpctl/tensor/_copy_utils.py:40, in _copy_to_numpy(ary) 38 def _copy_to_numpy(ary): 39 if not isinstance(ary, dpt.usm_ndarray): ---> 40 raise TypeError( 41 f"Expected dpctl.tensor.usm_ndarray, got {type(ary)}" 42 ) 43 nb = ary.usm_data.nbytes 44 hh = dpm.MemoryUSMHost(nb, queue=ary.sycl_queue) TypeError: Expected dpctl.tensor.usm_ndarray, got In [3]: quit ``` --- dpctl/tensor/_copy_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/tensor/_copy_utils.py b/dpctl/tensor/_copy_utils.py index bc1b071460..81928692a6 100644 --- a/dpctl/tensor/_copy_utils.py +++ b/dpctl/tensor/_copy_utils.py @@ -37,7 +37,7 @@ def _copy_to_numpy(ary): if not isinstance(ary, dpt.usm_ndarray): - raise TypeError + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(ary)}") nb = ary.usm_data.nbytes hh = dpm.MemoryUSMHost(nb, queue=ary.sycl_queue) hh.copy_from_device(ary.usm_data) From 11ecba8e282a3d34f1883819d2ed08010bba6036 Mon Sep 17 00:00:00 2001 From: ndgrigorian <46709016+ndgrigorian@users.noreply.github.com> Date: Wed, 1 Nov 2023 10:28:39 -0700 Subject: [PATCH 63/83] Fix search reductions giving incorrect results for F-contiguous inputs (#1462) * Fixes correctness regression in search functions ``py_search_over_axis`` no longer calls the ``axis1`` contiguous variant ``py_search_over_axis`` now only calls ``axis0`` variant wh * Adds tests for fixed search reduction behavior --- .../source/reductions/reduction_over_axis.hpp | 13 +++---------- dpctl/tests/test_usm_ndarray_reductions.py | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp index aa46f1c02a..f1b924dd47 100644 --- a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp +++ b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp @@ -874,14 +874,11 @@ std::pair py_search_over_axis( int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); // handle special case when both reduction and iteration are 1D contiguous - // and can be done with atomics bool is_src_c_contig = src.is_c_contiguous(); bool is_dst_c_contig = dst.is_c_contiguous(); bool is_src_f_contig = src.is_f_contiguous(); - if ((is_src_c_contig && is_dst_c_contig) || - (is_src_f_contig && dst_nelems == 1)) - { + if (is_src_c_contig && is_dst_c_contig) { auto fn = axis1_contig_dispatch_table[src_typeid][dst_typeid]; if (fn != nullptr) { size_t iter_nelems = dst_nelems; @@ -903,9 +900,7 @@ std::pair py_search_over_axis( reduction_over_axis_contig_ev); } } - else if (is_src_f_contig && - ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) - { + else if (is_src_f_contig && dst_nd == 1) { auto fn = axis0_contig_dispatch_table[src_typeid][dst_typeid]; if (fn != nullptr) { size_t iter_nelems = dst_nelems; @@ -983,11 +978,9 @@ std::pair py_search_over_axis( if ((reduction_nd == 1) && (iteration_nd == 1)) { bool mat_reduce_over_axis1 = false; bool mat_reduce_over_axis0 = false; - bool array_reduce_all_elems = false; size_t iter_nelems = dst_nelems; if (compact_reduction_src_strides[0] == 1) { - array_reduce_all_elems = (simplified_iteration_shape[0] == 1); mat_reduce_over_axis1 = (simplified_iteration_dst_strides[0] == 1) && (static_cast(simplified_iteration_src_strides[0]) == @@ -1000,7 +993,7 @@ std::pair py_search_over_axis( (simplified_iteration_src_strides[0] == 1); } - if 
(mat_reduce_over_axis1 || array_reduce_all_elems) {
+        if (mat_reduce_over_axis1) {
             auto fn = axis1_contig_dispatch_table[src_typeid][dst_typeid];
             if (fn != nullptr) {
                 sycl::event reduction_over_axis1_contig_ev =
diff --git a/dpctl/tests/test_usm_ndarray_reductions.py b/dpctl/tests/test_usm_ndarray_reductions.py
index 45afb26aac..cbfd6baec6 100644
--- a/dpctl/tests/test_usm_ndarray_reductions.py
+++ b/dpctl/tests/test_usm_ndarray_reductions.py
@@ -265,6 +265,22 @@ def test_argmax_argmin_identities():
     assert dpt.argmin(x) == 0


+@pytest.mark.parametrize("order", ["C", "F"])
+def test_argmax_axis0_axis1(order):
+    get_queue_or_skip()
+
+    x = dpt.asarray([[1, 2, 3], [6, 5, 4]], dtype="i4", order=order)
+    assert dpt.argmax(x) == 3
+
+    res = dpt.argmax(x, axis=0)
+    expected = dpt.asarray([1, 1, 1], dtype=res.dtype)
+    assert dpt.all(res == expected)
+
+    res = dpt.argmax(x, axis=1)
+    expected = dpt.asarray([2, 0], dtype=res.dtype)
+    assert dpt.all(res == expected)
+
+
 def test_reduction_arg_validation():
     get_queue_or_skip()

From c742e79cdfd3426fdbe73092091baad3b15abae9 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk
Date: Thu, 2 Nov 2023 05:44:38 -0500
Subject: [PATCH 64/83] Use SequentialReductionKernel for tree-reduction as
 well

1. Renamed a misspelled variable
2. If reduction_nelems is small, use SequentialReductionKernel for
   tree-reductions, as is done for the atomic reduction
3. Tweak the scaling-down logic for a moderately-sized number of elements
   to reduce. We should only use max_wg if iter_nelems is very small (one),
   since choosing max_wg for large iter_nelems may lead to
   under-utilization of the GPU.
---
 .../libtensor/include/kernels/reductions.hpp | 311 +++++++++++-------
 1 file changed, 194 insertions(+), 117 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp
index 40e5bd282d..7c40623048 100644
--- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp
@@ -477,11 +477,11 @@ sycl::event reduction_over_group_with_atomics_strided_impl(
         ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset,
                                             reduction_shape_stride};

-        constexpr size_t preferrered_reductions_per_wi = 4;
+        constexpr size_t preferred_reductions_per_wi = 8;
         size_t reductions_per_wi =
-            (reduction_nelems < preferrered_reductions_per_wi * wg)
+            (reduction_nelems < preferred_reductions_per_wi * wg)
                 ? std::max<size_t>(1, (reduction_nelems + wg - 1) / wg)
-                : preferrered_reductions_per_wi;
+                : preferred_reductions_per_wi;

         size_t reduction_groups =
             (reduction_nelems + reductions_per_wi * wg - 1) /
@@ -619,11 +619,11 @@ sycl::event reduction_axis1_over_group_with_atomics_contig_impl(
                                                     result_indexer};
         ReductionIndexerT reduction_indexer{};

-        constexpr size_t preferrered_reductions_per_wi = 8;
+        constexpr size_t preferred_reductions_per_wi = 8;
         size_t reductions_per_wi =
-            (reduction_nelems < preferrered_reductions_per_wi * wg)
+            (reduction_nelems < preferred_reductions_per_wi * wg)
                 ?
std::max(1, (reduction_nelems + wg - 1) / wg) - : preferrered_reductions_per_wi; + : preferred_reductions_per_wi; size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / @@ -718,11 +718,11 @@ sycl::event reduction_axis0_over_group_with_atomics_contig_impl( 0, /* size */ static_cast(reduction_nelems), /* step */ static_cast(iter_nelems)}; - constexpr size_t preferrered_reductions_per_wi = 8; + constexpr size_t preferred_reductions_per_wi = 8; size_t reductions_per_wi = - (reduction_nelems < preferrered_reductions_per_wi * wg) + (reduction_nelems < preferred_reductions_per_wi * wg) ? std::max(1, (reduction_nelems + wg - 1) / wg) - : preferrered_reductions_per_wi; + : preferred_reductions_per_wi; size_t reduction_groups = (reduction_nelems + reductions_per_wi * wg - 1) / @@ -1090,15 +1090,44 @@ sycl::event reduction_over_group_temps_strided_impl( const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr size_t preferrered_reductions_per_wi = 4; + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, + iter_shape_and_strides}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems)); + }); + + return comp_ev; + } + + constexpr size_t preferred_reductions_per_wi = 8; // max_max_wg prevents running out of resources on CPU constexpr size_t max_max_wg = 2048; size_t max_wg = std::min( max_max_wg, d.get_info() / 2); - size_t reductions_per_wi(preferrered_reductions_per_wi); - if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction only requries 1 work-group, can output directly to res + size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -1113,7 +1142,10 @@ sycl::event reduction_over_group_temps_strided_impl( ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, reduction_shape_stride}; - wg = max_wg; + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } reductions_per_wi = std::max(1, (reduction_nelems + wg - 1) / wg); @@ -1164,13 +1196,13 @@ sycl::event reduction_over_group_temps_strided_impl( else { // more than one work-groups is needed, requires a temporary size_t reduction_groups = - (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); size_t second_iter_reduction_groups_ = - (reduction_groups + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); resTy *partially_reduced_tmp = sycl::malloc_device( iter_nelems * (reduction_groups + second_iter_reduction_groups_), @@ 
-1227,7 +1259,7 @@ sycl::event reduction_over_group_temps_strided_impl( arg_tp, partially_reduced_tmp, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -1244,7 +1276,7 @@ sycl::event reduction_over_group_temps_strided_impl( arg_tp, partially_reduced_tmp, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, local_memory, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } }); @@ -1255,11 +1287,10 @@ sycl::event reduction_over_group_temps_strided_impl( sycl::event dependent_ev = first_reduction_ev; while (remaining_reduction_nelems > - preferrered_reductions_per_wi * max_wg) { - size_t reduction_groups_ = - (remaining_reduction_nelems + - preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing @@ -1302,7 +1333,7 @@ sycl::event reduction_over_group_temps_strided_impl( temp_arg, temp2_arg, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, remaining_reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -1319,7 +1350,7 @@ sycl::event reduction_over_group_temps_strided_impl( temp_arg, temp2_arg, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, local_memory, remaining_reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + iter_nelems, preferred_reductions_per_wi)); } }); @@ -1440,15 +1471,47 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr size_t preferrered_reductions_per_wi = 8; + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputIterIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{0, static_cast(iter_nelems), + static_cast(reduction_nelems)}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{}; + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems)); + }); + + return comp_ev; + } + + constexpr size_t preferred_reductions_per_wi = 8; // max_max_wg prevents running out of resources on CPU constexpr size_t max_max_wg = 2048; size_t max_wg = std::min( max_max_wg, d.get_info() / 2); - size_t reductions_per_wi(preferrered_reductions_per_wi); - if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction only requries 1 work-group, can output directly to res + size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res sycl::event comp_ev = 
exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -1466,7 +1529,10 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{}; - wg = max_wg; + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } reductions_per_wi = std::max(1, (reduction_nelems + wg - 1) / wg); @@ -1518,13 +1584,13 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( else { // more than one work-groups is needed, requires a temporary size_t reduction_groups = - (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); size_t second_iter_reduction_groups_ = - (reduction_groups + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); resTy *partially_reduced_tmp = sycl::malloc_device( iter_nelems * (reduction_groups + second_iter_reduction_groups_), @@ -1575,7 +1641,7 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( arg_tp, partially_reduced_tmp, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -1592,7 +1658,7 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( arg_tp, partially_reduced_tmp, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, local_memory, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } }); @@ -1603,11 +1669,10 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( sycl::event dependent_ev = first_reduction_ev; while (remaining_reduction_nelems > - preferrered_reductions_per_wi * max_wg) { - size_t reduction_groups_ = - (remaining_reduction_nelems + - preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing @@ -1650,7 +1715,7 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( temp_arg, temp2_arg, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, remaining_reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -1667,7 +1732,7 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( temp_arg, temp2_arg, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, local_memory, remaining_reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + iter_nelems, preferred_reductions_per_wi)); } }); @@ -1784,15 +1849,16 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr size_t preferrered_reductions_per_wi = 8; + constexpr size_t preferred_reductions_per_wi = 8; // max_max_wg prevents running out of resources on CPU constexpr size_t max_max_wg = 2048; size_t max_wg = std::min( max_max_wg, d.get_info() / 2); - size_t reductions_per_wi(preferrered_reductions_per_wi); - if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction 
only requries 1 work-group, can output directly to res + size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -1811,7 +1877,10 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( 0, /* size */ static_cast(reduction_nelems), /* step */ static_cast(iter_nelems)}; - wg = max_wg; + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } reductions_per_wi = std::max(1, (reduction_nelems + wg - 1) / wg); @@ -1863,13 +1932,13 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( else { // more than one work-groups is needed, requires a temporary size_t reduction_groups = - (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); size_t second_iter_reduction_groups_ = - (reduction_groups + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); resTy *partially_reduced_tmp = sycl::malloc_device( iter_nelems * (reduction_groups + second_iter_reduction_groups_), @@ -1920,7 +1989,7 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( arg_tp, partially_reduced_tmp, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -1937,7 +2006,7 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( arg_tp, partially_reduced_tmp, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, local_memory, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } }); @@ -1948,11 +2017,10 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( sycl::event dependent_ev = first_reduction_ev; while (remaining_reduction_nelems > - preferrered_reductions_per_wi * max_wg) { - size_t reduction_groups_ = - (remaining_reduction_nelems + - preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing @@ -1995,7 +2063,7 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( temp_arg, temp2_arg, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, remaining_reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -2012,7 +2080,7 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( temp_arg, temp2_arg, ReductionOpT(), identity_val, in_out_iter_indexer, reduction_indexer, local_memory, remaining_reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + iter_nelems, preferred_reductions_per_wi)); } }); @@ -3881,15 +3949,16 @@ sycl::event search_over_group_temps_strided_impl( const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr size_t preferrered_reductions_per_wi = 4; + constexpr size_t 
preferred_reductions_per_wi = 4; // max_max_wg prevents running out of resources on CPU size_t max_wg = std::min(size_t(2048), d.get_info() / 2); - size_t reductions_per_wi(preferrered_reductions_per_wi); - if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction only requries 1 work-group, can output directly to res + size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -3904,7 +3973,10 @@ sycl::event search_over_group_temps_strided_impl( ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, reduction_shape_stride}; - wg = max_wg; + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } reductions_per_wi = std::max(1, (reduction_nelems + wg - 1) / wg); @@ -3956,13 +4028,13 @@ sycl::event search_over_group_temps_strided_impl( else { // more than one work-groups is needed, requires a temporary size_t reduction_groups = - (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); size_t second_iter_reduction_groups_ = - (reduction_groups + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); resTy *partially_reduced_tmp = sycl::malloc_device( iter_nelems * (reduction_groups + second_iter_reduction_groups_), @@ -4031,7 +4103,7 @@ sycl::event search_over_group_temps_strided_impl( partially_reduced_tmp, ReductionOpT(), identity_val, IndexOpT(), idx_identity_val, in_out_iter_indexer, reduction_indexer, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -4050,7 +4122,7 @@ sycl::event search_over_group_temps_strided_impl( partially_reduced_tmp, ReductionOpT(), identity_val, IndexOpT(), idx_identity_val, in_out_iter_indexer, reduction_indexer, local_memory, reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + iter_nelems, preferred_reductions_per_wi)); } }); @@ -4065,11 +4137,10 @@ sycl::event search_over_group_temps_strided_impl( sycl::event dependent_ev = first_reduction_ev; while (remaining_reduction_nelems > - preferrered_reductions_per_wi * max_wg) { - size_t reduction_groups_ = - (remaining_reduction_nelems + - preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing @@ -4114,7 +4185,7 @@ sycl::event search_over_group_temps_strided_impl( ReductionOpT(), identity_val, IndexOpT(), idx_identity_val, in_out_iter_indexer, reduction_indexer, remaining_reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + iter_nelems, preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -4135,7 +4206,7 @@ sycl::event search_over_group_temps_strided_impl( idx_identity_val, in_out_iter_indexer, reduction_indexer, local_memory, remaining_reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + 
preferred_reductions_per_wi)); } }); @@ -4278,15 +4349,16 @@ sycl::event search_axis1_over_group_temps_contig_impl( const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr size_t preferrered_reductions_per_wi = 8; + constexpr size_t preferred_reductions_per_wi = 8; // max_max_wg prevents running out of resources on CPU size_t max_wg = std::min(size_t(2048), d.get_info() / 2); - size_t reductions_per_wi(preferrered_reductions_per_wi); - if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction only requries 1 work-group, can output directly to res + size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -4304,7 +4376,10 @@ sycl::event search_axis1_over_group_temps_contig_impl( NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{}; - wg = max_wg; + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } reductions_per_wi = std::max(1, (reduction_nelems + wg - 1) / wg); @@ -4356,13 +4431,13 @@ sycl::event search_axis1_over_group_temps_contig_impl( else { // more than one work-groups is needed, requires a temporary size_t reduction_groups = - (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); size_t second_iter_reduction_groups_ = - (reduction_groups + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); resTy *partially_reduced_tmp = sycl::malloc_device( iter_nelems * (reduction_groups + second_iter_reduction_groups_), @@ -4425,7 +4500,7 @@ sycl::event search_axis1_over_group_temps_contig_impl( partially_reduced_tmp, ReductionOpT(), identity_val, IndexOpT(), idx_identity_val, in_out_iter_indexer, reduction_indexer, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -4444,7 +4519,7 @@ sycl::event search_axis1_over_group_temps_contig_impl( partially_reduced_tmp, ReductionOpT(), identity_val, IndexOpT(), idx_identity_val, in_out_iter_indexer, reduction_indexer, local_memory, reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + iter_nelems, preferred_reductions_per_wi)); } }); @@ -4459,11 +4534,10 @@ sycl::event search_axis1_over_group_temps_contig_impl( sycl::event dependent_ev = first_reduction_ev; while (remaining_reduction_nelems > - preferrered_reductions_per_wi * max_wg) { - size_t reduction_groups_ = - (remaining_reduction_nelems + - preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing @@ -4508,7 +4582,7 @@ sycl::event search_axis1_over_group_temps_contig_impl( ReductionOpT(), identity_val, IndexOpT(), idx_identity_val, in_out_iter_indexer, reduction_indexer, remaining_reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + iter_nelems, preferred_reductions_per_wi)); } 
else { using SlmT = sycl::local_accessor; @@ -4529,7 +4603,7 @@ sycl::event search_axis1_over_group_temps_contig_impl( idx_identity_val, in_out_iter_indexer, reduction_indexer, local_memory, remaining_reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } }); @@ -4657,15 +4731,16 @@ sycl::event search_axis0_over_group_temps_contig_impl( const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - constexpr size_t preferrered_reductions_per_wi = 8; + constexpr size_t preferred_reductions_per_wi = 8; // max_max_wg prevents running out of resources on CPU size_t max_wg = std::min(size_t(2048), d.get_info() / 2); - size_t reductions_per_wi(preferrered_reductions_per_wi); - if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction only requries 1 work-group, can output directly to res + size_t reductions_per_wi(preferred_reductions_per_wi); + if (reduction_nelems <= preferred_reductions_per_wi * max_wg) { + // Perform reduction using one 1 work-group per iteration, + // can output directly to res sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -4684,7 +4759,10 @@ sycl::event search_axis0_over_group_temps_contig_impl( 0, /* size */ static_cast(reduction_nelems), /* step */ static_cast(iter_nelems)}; - wg = max_wg; + if (iter_nelems == 1) { + // increase GPU occupancy + wg = max_wg; + } reductions_per_wi = std::max(1, (reduction_nelems + wg - 1) / wg); @@ -4736,13 +4814,13 @@ sycl::event search_axis0_over_group_temps_contig_impl( else { // more than one work-groups is needed, requires a temporary size_t reduction_groups = - (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_nelems + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups > 1); size_t second_iter_reduction_groups_ = - (reduction_groups + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + (reduction_groups + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); resTy *partially_reduced_tmp = sycl::malloc_device( iter_nelems * (reduction_groups + second_iter_reduction_groups_), @@ -4806,7 +4884,7 @@ sycl::event search_axis0_over_group_temps_contig_impl( partially_reduced_tmp, ReductionOpT(), identity_val, IndexOpT(), idx_identity_val, in_out_iter_indexer, reduction_indexer, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -4825,7 +4903,7 @@ sycl::event search_axis0_over_group_temps_contig_impl( partially_reduced_tmp, ReductionOpT(), identity_val, IndexOpT(), idx_identity_val, in_out_iter_indexer, reduction_indexer, local_memory, reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + iter_nelems, preferred_reductions_per_wi)); } }); @@ -4840,11 +4918,10 @@ sycl::event search_axis0_over_group_temps_contig_impl( sycl::event dependent_ev = first_reduction_ev; while (remaining_reduction_nelems > - preferrered_reductions_per_wi * max_wg) { - size_t reduction_groups_ = - (remaining_reduction_nelems + - preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); + preferred_reductions_per_wi * max_wg) { + size_t reduction_groups_ = (remaining_reduction_nelems + + preferred_reductions_per_wi * wg - 1) / + (preferred_reductions_per_wi * wg); assert(reduction_groups_ > 1); // keep reducing @@ 
-4889,7 +4966,7 @@ sycl::event search_axis0_over_group_temps_contig_impl( ReductionOpT(), identity_val, IndexOpT(), idx_identity_val, in_out_iter_indexer, reduction_indexer, remaining_reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + iter_nelems, preferred_reductions_per_wi)); } else { using SlmT = sycl::local_accessor; @@ -4910,7 +4987,7 @@ sycl::event search_axis0_over_group_temps_contig_impl( idx_identity_val, in_out_iter_indexer, reduction_indexer, local_memory, remaining_reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); + preferred_reductions_per_wi)); } }); From 6a0b09c1157d7c68f68ffc20738f601e29784dfd Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 2 Nov 2023 08:52:01 -0500 Subject: [PATCH 65/83] Apply SequentialReductionKernel to axis0 reduction --- .../libtensor/include/kernels/reductions.hpp | 71 ++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 7c40623048..36aad581a6 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -696,7 +696,41 @@ sycl::event reduction_axis0_over_group_with_atomics_contig_impl( const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - { + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{ + 0, static_cast(reduction_nelems), + static_cast(iter_nelems)}; + + using KernelName = + class reduction_seq_contig_krn; + + sycl::range<1> iter_range{iter_nelems}; + + cgh.parallel_for( + iter_range, + SequentialReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems)); + }); + + return comp_ev; + } + else { sycl::event res_init_ev = exec_q.fill( res_tp, resTy(identity_val), iter_nelems, depends); @@ -1849,6 +1883,41 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{ + 0, static_cast(reduction_nelems), + static_cast(iter_nelems)}; + + using KernelName = + class reduction_seq_contig_krn; + + sycl::range<1> iter_range{iter_nelems}; + + cgh.parallel_for( + iter_range, + SequentialReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems)); + }); + + return comp_ev; + } + constexpr size_t preferred_reductions_per_wi = 8; // max_max_wg prevents running out of resources on CPU 
constexpr size_t max_max_wg = 2048; From f74eae0d86bbd1971661e9f022e948d4f9eb490b Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 2 Nov 2023 14:05:20 -0500 Subject: [PATCH 66/83] Split _tensor_impl into three extensions _tensor_impl continues holding constructors, where, clip _tensor_elementwise_impl holds elementwise functions _tensor_reductions_impl holds reduction functions. --- dpctl/tensor/CMakeLists.txt | 68 +++++++++++++------ .../{tensor_py.cpp => tensor_ctors.cpp} | 8 +-- .../libtensor/source/tensor_elementwise.cpp | 34 ++++++++++ .../libtensor/source/tensor_reductions.cpp | 37 ++++++++++ 4 files changed, 120 insertions(+), 27 deletions(-) rename dpctl/tensor/libtensor/source/{tensor_py.cpp => tensor_ctors.cpp} (98%) create mode 100644 dpctl/tensor/libtensor/source/tensor_elementwise.cpp create mode 100644 dpctl/tensor/libtensor/source/tensor_reductions.cpp diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 9c02a325bc..d1de208805 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -113,10 +113,13 @@ set(_reduction_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/reduce_hypot.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reductions/sum.cpp ) +set(_boolean_reduction_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_reductions.cpp +) set(_tensor_impl_sources - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_py.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_ctors.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/accumulators.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_and_cast_usm_to_usm.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/copy_for_reshape.cpp @@ -128,19 +131,39 @@ set(_tensor_impl_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/triul_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/where.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_reductions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/clip.cpp ) -list(APPEND _tensor_impl_sources - ${_elementwise_sources} - ${_reduction_sources} +set(_tensor_elementwise_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_elementwise.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp + ${_elementwise_sources} +) +set(_tensor_reductions_impl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/tensor_reductions.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/simplify_iteration_space.cpp + ${_boolean_reduction_sources} + ${_reduction_sources} ) +set(_py_trgts) + set(python_module_name _tensor_impl) pybind11_add_module(${python_module_name} MODULE ${_tensor_impl_sources}) add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_impl_sources}) +list(APPEND _py_trgts ${python_module_name}) + +set(python_module_name _tensor_elementwise_impl) +pybind11_add_module(${python_module_name} MODULE ${_tensor_elementwise_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_elementwise_impl_sources}) +list(APPEND _py_trgts ${python_module_name}) + +set(python_module_name _tensor_reductions_impl) 
+pybind11_add_module(${python_module_name} MODULE ${_tensor_reductions_impl_sources}) +add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_reductions_impl_sources}) +list(APPEND _py_trgts ${python_module_name}) + set(_clang_prefix "") if (WIN32) set(_clang_prefix "/clang:") @@ -170,19 +193,22 @@ if (UNIX) ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sqrt.cpp PROPERTIES COMPILE_DEFINITIONS "USE_STD_ABS_FOR_COMPLEX_TYPES;USE_STD_SQRT_FOR_COMPLEX_TYPES") endif() -target_compile_options(${python_module_name} PRIVATE -fno-sycl-id-queries-fit-in-int) -target_link_options(${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel) -if(UNIX) - # this option is supported on Linux only - target_link_options(${python_module_name} PRIVATE -fsycl-link-huge-device-code) -endif() -target_include_directories(${python_module_name} - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/../include - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/ -) + set(_linker_options "LINKER:${DPCTL_LDFLAGS}") -target_link_options(${python_module_name} PRIVATE ${_linker_options}) -add_dependencies(${python_module_name} _dpctl4pybind11_deps) -install(TARGETS ${python_module_name} DESTINATION "dpctl/tensor") +foreach(python_module_name ${_py_trgts}) + target_compile_options(${python_module_name} PRIVATE -fno-sycl-id-queries-fit-in-int) + target_link_options(${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel) + if(UNIX) + # this option is supported on Linux only + target_link_options(${python_module_name} PRIVATE -fsycl-link-huge-device-code) + endif() + target_include_directories(${python_module_name} + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../include + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/include + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/ + ) + target_link_options(${python_module_name} PRIVATE ${_linker_options}) + add_dependencies(${python_module_name} _dpctl4pybind11_deps) + install(TARGETS ${python_module_name} DESTINATION "dpctl/tensor") +endforeach() diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_ctors.cpp similarity index 98% rename from dpctl/tensor/libtensor/source/tensor_py.cpp rename to dpctl/tensor/libtensor/source/tensor_ctors.cpp index d07d5cf084..8db874cbe1 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_ctors.cpp @@ -1,4 +1,5 @@ -//===-- tensor_py.cpp - Implementation of _tensor_impl module --*-C++-*-/===// +//===-- tensor_ctors.cpp - ---*-C++-*-/===// +// Implementation of _tensor_impl module // // Data Parallel Control (dpctl) // @@ -43,7 +44,6 @@ #include "copy_for_roll.hpp" #include "copy_numpy_ndarray_into_usm_ndarray.hpp" #include "device_support_queries.hpp" -#include "elementwise_functions/elementwise_common.hpp" #include "eye_ctor.hpp" #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" @@ -454,8 +454,4 @@ PYBIND11_MODULE(_tensor_impl, m) "Returns a tuple of events: (hev, ev)", py::arg("src"), py::arg("min"), py::arg("max"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list()); - - dpctl::tensor::py_internal::init_elementwise_functions(m); - dpctl::tensor::py_internal::init_boolean_reduction_functions(m); - dpctl::tensor::py_internal::init_reduction_functions(m); } diff --git a/dpctl/tensor/libtensor/source/tensor_elementwise.cpp b/dpctl/tensor/libtensor/source/tensor_elementwise.cpp new file mode 100644 index 0000000000..1a86526893 --- /dev/null +++ 
b/dpctl/tensor/libtensor/source/tensor_elementwise.cpp @@ -0,0 +1,34 @@ +//===-- tensor_elementwise.cpp ---*-C++-*-/===// +// Implementation of _tensor_elementwise_impl module +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include "elementwise_functions/elementwise_common.hpp" +#include + +namespace py = pybind11; + +PYBIND11_MODULE(_tensor_elementwise_impl, m) +{ + dpctl::tensor::py_internal::init_elementwise_functions(m); +} diff --git a/dpctl/tensor/libtensor/source/tensor_reductions.cpp b/dpctl/tensor/libtensor/source/tensor_reductions.cpp new file mode 100644 index 0000000000..138c31f3eb --- /dev/null +++ b/dpctl/tensor/libtensor/source/tensor_reductions.cpp @@ -0,0 +1,37 @@ +//===-- tensor_reductions.cpp - --*-C++-*-/===// +// Implementation of _tensor_reductions_impl module +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===----------------------------------------------------------------------===// + +#include + +#include "boolean_reductions.hpp" +#include "reductions/reduction_common.hpp" + +namespace py = pybind11; + +PYBIND11_MODULE(_tensor_reductions_impl, m) +{ + dpctl::tensor::py_internal::init_boolean_reduction_functions(m); + dpctl::tensor::py_internal::init_reduction_functions(m); +} From 421b2703e3cad211e2715a9a3e9cbf616b1e28c3 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 2 Nov 2023 14:07:02 -0500 Subject: [PATCH 67/83] Used new native extension modules --- dpctl/tensor/_clip.py | 5 +++-- dpctl/tensor/_elementwise_funcs.py | 2 +- dpctl/tensor/_reduction.py | 25 +++++++++++++------------ dpctl/tensor/_utility_functions.py | 5 +++-- 4 files changed, 20 insertions(+), 17 deletions(-) diff --git a/dpctl/tensor/_clip.py b/dpctl/tensor/_clip.py index 5a3a96933f..eeed87b404 100644 --- a/dpctl/tensor/_clip.py +++ b/dpctl/tensor/_clip.py @@ -16,6 +16,7 @@ import dpctl import dpctl.tensor as dpt +import dpctl.tensor._tensor_elementwise_impl as tei import dpctl.tensor._tensor_impl as ti from dpctl.tensor._copy_utils import ( _empty_like_orderK, @@ -429,9 +430,9 @@ def clip(x, min=None, max=None, out=None, order="K"): "only one of `min` and `max` is permitted to be `None`" ) elif max is None: - return _clip_none(x, min, out, order, ti._maximum) + return _clip_none(x, min, out, order, tei._maximum) elif min is None: - return _clip_none(x, max, out, order, ti._minimum) + return _clip_none(x, max, out, order, tei._minimum) else: q1, x_usm_type = x.sycl_queue, x.usm_type q2, min_usm_type = _get_queue_usm_type(min) diff --git a/dpctl/tensor/_elementwise_funcs.py b/dpctl/tensor/_elementwise_funcs.py index aa5ba04b19..9879960999 100644 --- a/dpctl/tensor/_elementwise_funcs.py +++ b/dpctl/tensor/_elementwise_funcs.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import dpctl.tensor._tensor_impl as ti +import dpctl.tensor._tensor_elementwise_impl as ti from ._elementwise_common import BinaryElementwiseFunc, UnaryElementwiseFunc from ._type_utils import _acceptance_fn_divide diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index 0edc9ac12b..0cd302cccc 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -19,6 +19,7 @@ import dpctl import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti +import dpctl.tensor._tensor_reductions_impl as tri from ._type_utils import _to_device_supported_dtype @@ -220,8 +221,8 @@ def sum(x, axis=None, dtype=None, keepdims=False): axis, dtype, keepdims, - ti._sum_over_axis, - ti._sum_over_axis_dtype_supported, + tri._sum_over_axis, + tri._sum_over_axis_dtype_supported, _default_reduction_dtype, _identity=0, ) @@ -281,8 +282,8 @@ def prod(x, axis=None, dtype=None, keepdims=False): axis, dtype, keepdims, - ti._prod_over_axis, - ti._prod_over_axis_dtype_supported, + tri._prod_over_axis, + tri._prod_over_axis_dtype_supported, _default_reduction_dtype, _identity=1, ) @@ -335,8 +336,8 @@ def logsumexp(x, axis=None, dtype=None, keepdims=False): axis, dtype, keepdims, - ti._logsumexp_over_axis, - lambda inp_dt, res_dt, *_: ti._logsumexp_over_axis_dtype_supported( + tri._logsumexp_over_axis, + lambda inp_dt, res_dt, *_: tri._logsumexp_over_axis_dtype_supported( inp_dt, res_dt ), _default_reduction_dtype_fp_types, @@ -391,8 +392,8 @@ def reduce_hypot(x, axis=None, dtype=None, keepdims=False): axis, dtype, keepdims, - ti._hypot_over_axis, - lambda inp_dt, res_dt, *_: ti._hypot_over_axis_dtype_supported( + tri._hypot_over_axis, + lambda inp_dt, res_dt, *_: tri._hypot_over_axis_dtype_supported( inp_dt, res_dt ), _default_reduction_dtype_fp_types, @@ -468,7 +469,7 @@ def max(x, axis=None, keepdims=False): entire array, a zero-dimensional array is returned. The returned array has the same data type as `x`. """ - return _comparison_over_axis(x, axis, keepdims, ti._max_over_axis) + return _comparison_over_axis(x, axis, keepdims, tri._max_over_axis) def min(x, axis=None, keepdims=False): @@ -496,7 +497,7 @@ def min(x, axis=None, keepdims=False): entire array, a zero-dimensional array is returned. The returned array has the same data type as `x`. """ - return _comparison_over_axis(x, axis, keepdims, ti._min_over_axis) + return _comparison_over_axis(x, axis, keepdims, tri._min_over_axis) def _search_over_axis(x, axis, keepdims, _reduction_fn): @@ -577,7 +578,7 @@ def argmax(x, axis=None, keepdims=False): zero-dimensional array is returned. The returned array has the default array index data type for the device of `x`. """ - return _search_over_axis(x, axis, keepdims, ti._argmax_over_axis) + return _search_over_axis(x, axis, keepdims, tri._argmax_over_axis) def argmin(x, axis=None, keepdims=False): @@ -609,4 +610,4 @@ def argmin(x, axis=None, keepdims=False): zero-dimensional array is returned. The returned array has the default array index data type for the device of `x`. 
""" - return _search_over_axis(x, axis, keepdims, ti._argmin_over_axis) + return _search_over_axis(x, axis, keepdims, tri._argmin_over_axis) diff --git a/dpctl/tensor/_utility_functions.py b/dpctl/tensor/_utility_functions.py index 500c997e8f..69a1a200df 100644 --- a/dpctl/tensor/_utility_functions.py +++ b/dpctl/tensor/_utility_functions.py @@ -3,6 +3,7 @@ import dpctl import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti +import dpctl.tensor._tensor_reductions_impl as tri def _boolean_reduction(x, axis, keepdims, func): @@ -94,7 +95,7 @@ def all(x, axis=None, keepdims=False): An array with a data type of `bool` containing the results of the logical AND reduction. """ - return _boolean_reduction(x, axis, keepdims, ti._all) + return _boolean_reduction(x, axis, keepdims, tri._all) def any(x, axis=None, keepdims=False): @@ -122,4 +123,4 @@ def any(x, axis=None, keepdims=False): An array with a data type of `bool` containing the results of the logical OR reduction. """ - return _boolean_reduction(x, axis, keepdims, ti._any) + return _boolean_reduction(x, axis, keepdims, tri._any) From 41ec378f44dedc6c093f4d3fb1361285cac91811 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Nov 2023 02:12:45 -0500 Subject: [PATCH 68/83] Added docstrings and getter methods for ElementwiseFunc classes Added stable API to retrieve implementation functions in each elementwise function class instance to allow `dpnp` to access that information using stable API. --- dpctl/tensor/_elementwise_common.py | 150 ++++++++++++++++++ .../elementwise/test_elementwise_classes.py | 80 ++++++++++ 2 files changed, 230 insertions(+) create mode 100644 dpctl/tests/elementwise/test_elementwise_classes.py diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py index baaac078b5..fe2a77a9d5 100644 --- a/dpctl/tensor/_elementwise_common.py +++ b/dpctl/tensor/_elementwise_common.py @@ -39,6 +39,31 @@ class UnaryElementwiseFunc: """ Class that implements unary element-wise functions. + + Args: + name (str): + Name of the unary function + result_type_resovler_fn (callable): + Function that takes dtype of the input and + returns the dtype of the result if the + implementation functions supports it, or + returns `None` otherwise. + unary_dp_impl_fn (callable): + Data-parallel implementation function with signature + `impl_fn(src: usm_ndarray, dst: usm_ndarray, + sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])` + where the `src` is the argument array, `dst` is the + array to be populated with function values, effectively + evaluating `dst = func(src)`. + The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s. + The first event corresponds to data-management host tasks, + including lifetime management of argument Python objects to ensure + that their associated USM allocation is not freed before offloaded + computational tasks complete execution, while the second event + corresponds to computational tasks associated with function + evaluation. + docs (str): + Documentation string for the unary function. """ def __init__(self, name, result_type_resolver_fn, unary_dp_impl_fn, docs): @@ -55,8 +80,31 @@ def __str__(self): def __repr__(self): return f"<{self.__name__} '{self.name_}'>" + def get_implementation_function(self): + """Returns the implementation function for + this elementwise unary function. + + """ + return self.unary_fn_ + + def get_type_result_resolver_function(self): + """Returns the type resolver function for this + elementwise unary function. 
+ """ + return self.result_type_resolver_fn_ + @property def types(self): + """Returns information about types supported by + implementation function, using NumPy's character + encoding for data types, e.g. + + :Example: + .. code-block:: python + + dpctl.tensor.sin.types + # Outputs: ['e->e', 'f->f', 'd->d', 'F->F', 'D->D'] + """ types = self.types_ if not types: types = [] @@ -363,6 +411,56 @@ def _get_shape(o): class BinaryElementwiseFunc: """ Class that implements binary element-wise functions. + + Args: + name (str): + Name of the unary function + result_type_resovle_fn (callable): + Function that takes dtypes of the input and + returns the dtype of the result if the + implementation functions supports it, or + returns `None` otherwise. + binary_dp_impl_fn (callable): + Data-parallel umplementation function with signature + `impl_fn(src1: usm_ndarray, src2: usm_ndarray, dst: usm_ndarray, + sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])` + where the `src1` and `src2` are the argument arrays, `dst` is the + array to be populated with function values, + i.e. `dst=func(src1, src2)`. + The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s. + The first event corresponds to data-management host tasks, + including lifetime management of argument Python objects to ensure + that their associated USM allocation is not freed before offloaded + computational tasks complete execution, while the second event + corresponds to computational tasks associated with function + evaluation. + docs (str): + Documentation string for the unary function. + binary_inplace_fn (callable, optional): + Data-parallel omplementation function with signature + `impl_fn(src: usm_ndarray, dst: usm_ndarray, + sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])` + where the `src` is the argument array, `dst` is the + array to be populated with function values, + i.e. `dst=func(dst, src)`. + The `impl_fn` is expected to return a 2-tuple of `SyclEvent`s. + The first event corresponds to data-management host tasks, + including async lifetime management of Python arguments, + while the second event corresponds to computational tasks + associated with function evaluation. + acceptance_fn (callable, optional): + Function to influence type promotion behavior of this binary + function. The function takes 6 arguments: + arg1_dtype - Data type of the first argument + arg2_dtype - Data type of the second argument + ret_buf1_dtype - Data type the first argument would be cast to + ret_buf2_dtype - Data type the second argument would be cast to + res_dtype - Data type of the output array with function values + sycl_dev - The :class:`dpctl.SyclDevice` where the function + evaluation is carried out. + The function is only called when both arguments of the binary + function require casting, e.g. both arguments of + `dpctl.tensor.logaddexp` are arrays with integral data type. """ def __init__( @@ -392,8 +490,60 @@ def __str__(self): def __repr__(self): return f"<{self.__name__} '{self.name_}'>" + def get_implementation_function(self): + """Returns the out-of-place implementation + function for this elementwise binary function. + + """ + return self.binary_fn_ + + def get_implementation_inplace_function(self): + """Returns the in-place implementation + function for this elementwise binary function. + + """ + return self.binary_inplace_fn_ + + def get_type_result_resolver_function(self): + """Returns the type resolver function for this + elementwise binary function. 
+ """ + return self.result_type_resolver_fn_ + + def get_type_promotion_path_acceptance_function(self): + """Returns the acceptance function for this + elementwise binary function. + + Acceptance function influences the type promotion + behavior of this binary function. + The function takes 6 arguments: + arg1_dtype - Data type of the first argument + arg2_dtype - Data type of the second argument + ret_buf1_dtype - Data type the first argument would be cast to + ret_buf2_dtype - Data type the second argument would be cast to + res_dtype - Data type of the output array with function values + sycl_dev - :class:`dpctl.SyclDevice` on which function evaluation + is carried out. + + The acceptance function is only invoked if both input arrays must be + cast to intermediary data types, as would happen during call of + `dpctl.tensor.hypot` with both arrays being of integral data type. + """ + return self.acceptance_fn_ + @property def types(self): + """Returns information about types supported by + implementation function, using NumPy's character + encoding for data types, e.g. + + :Example: + .. code-block:: python + + dpctl.tensor.divide.types + # Outputs: ['ee->e', 'ff->f', 'fF->F', 'dd->d', 'dD->D', + # 'Ff->F', 'FF->F', 'Dd->D', 'DD->D'] + """ types = self.types_ if not types: types = [] diff --git a/dpctl/tests/elementwise/test_elementwise_classes.py b/dpctl/tests/elementwise/test_elementwise_classes.py new file mode 100644 index 0000000000..b7f1d26d6e --- /dev/null +++ b/dpctl/tests/elementwise/test_elementwise_classes.py @@ -0,0 +1,80 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import dpctl.tensor as dpt + +unary_fn = dpt.negative +binary_fn = dpt.divide + + +def test_unary_class_getters(): + fn = unary_fn.get_implementation_function() + assert callable(fn) + + fn = unary_fn.get_type_result_resolver_function() + assert callable(fn) + + +def test_unary_class_types_property(): + loop_types = unary_fn.types + assert isinstance(loop_types, list) + assert len(loop_types) > 0 + assert all(isinstance(sig, str) for sig in loop_types) + assert all("->" in sig for sig in loop_types) + + +def test_unary_class_str_repr(): + s = str(unary_fn) + r = repr(unary_fn) + + assert isinstance(s, str) + assert isinstance(r, str) + kl_n = unary_fn.__name__ + assert kl_n in s + assert kl_n in r + + +def test_binary_class_getters(): + fn = binary_fn.get_implementation_function() + assert callable(fn) + + fn = binary_fn.get_implementation_inplace_function() + assert callable(fn) + + fn = binary_fn.get_type_result_resolver_function() + assert callable(fn) + + fn = binary_fn.get_type_promotion_path_acceptance_function() + assert callable(fn) + + +def test_binary_class_types_property(): + loop_types = binary_fn.types + assert isinstance(loop_types, list) + assert len(loop_types) > 0 + assert all(isinstance(sig, str) for sig in loop_types) + assert all("->" in sig for sig in loop_types) + + +def test_binary_class_str_repr(): + s = str(binary_fn) + r = repr(binary_fn) + + assert isinstance(s, str) + assert isinstance(r, str) + kl_n = binary_fn.__name__ + assert kl_n in s + assert kl_n in r From 645044a6b6e7f4d5b9be8535bff4d612335d4e74 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Nov 2023 05:50:20 -0500 Subject: [PATCH 69/83] Instantiate atomic reduction templates for min/max ops for double/float types Added entries for float and double types to TypePairSupportDataForCompReductionAtomic as spotted by @ndgrigorian in the PR review. Also moved comments around. 
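For illustration only, a minimal Python-level sketch of the calls this
instantiation serves (the array values here are made up, and whether the
atomic path is actually taken still depends on the device and problem
shape):

    # sketch: same-type comparison reductions over float data, which the
    # newly added float/double entries make eligible for the atomic kernels
    import dpctl.tensor as dpt

    x = dpt.asarray([3.0, 1.0, 2.0], dtype=dpt.float32)
    m = dpt.max(x)  # float32 input -> float32 result
    n = dpt.min(x)  # likewise eligible for the atomic implementation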
--- .../libtensor/include/kernels/reductions.hpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 36aad581a6..f3754e8820 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -2247,11 +2247,10 @@ template struct TypePairSupportDataForCompReductionAtomic { - /* value if true a kernel for must be instantiated, false + /* value is true if a kernel for must be instantiated, false * otherwise */ - static constexpr bool is_defined = std::disjunction< // disjunction is C++17 - // feature, supported - // by DPC++ + // disjunction is C++17 feature, supported by DPC++ + static constexpr bool is_defined = std::disjunction< // input int32 td_ns::TypePairDefinedEntry, // input uint32 @@ -2260,6 +2259,10 @@ struct TypePairSupportDataForCompReductionAtomic td_ns::TypePairDefinedEntry, // input uint64 td_ns::TypePairDefinedEntry, + // input float + td_ns::TypePairDefinedEntry, + // input double + td_ns::TypePairDefinedEntry, // fall-through td_ns::NotDefinedEntry>::is_defined; }; @@ -2268,19 +2271,17 @@ template struct TypePairSupportDataForCompReductionTemps { - static constexpr bool is_defined = std::disjunction< // disjunction is C++17 - // feature, supported - // by DPC++ input bool + // disjunction is C++17 feature, supported by DPC++ + static constexpr bool is_defined = std::disjunction< + // input bool td_ns::TypePairDefinedEntry, // input int8_t td_ns::TypePairDefinedEntry, - // input uint8_t td_ns::TypePairDefinedEntry, // input int16_t td_ns::TypePairDefinedEntry, - // input uint16_t td_ns::TypePairDefinedEntry, From 097ecf5c6af6264b56aa7b8465e4775906d0bebd Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Nov 2023 05:54:16 -0500 Subject: [PATCH 70/83] Modified sycl_timer example to use dpctl.tensor function This removes use of dpnp.matmul from the example, making this example self-contained. --- examples/python/sycl_timer.py | 41 +++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/examples/python/sycl_timer.py b/examples/python/sycl_timer.py index f4b1416784..8ae49fd60d 100644 --- a/examples/python/sycl_timer.py +++ b/examples/python/sycl_timer.py @@ -15,14 +15,27 @@ # limitations under the License. 
-import dpnp import numpy as np import dpctl import dpctl.tensor as dpt from dpctl import SyclTimer -n = 4000 + +def matmul(m1, m2): + """Naive matrix multiplication implementation""" + assert m1.ndim == 2 + assert m2.ndim == 2 + assert m1.shape[1] == m2.shape[0] + m1 = m1[:, dpt.newaxis, :] + m2 = dpt.permute_dims(m2, (1, 0))[dpt.newaxis, :, :] + # form m_prod[i, j, k] = m1[i,k] * m2[k, j] + m_prods = m1 * m2 + # sum over k + return dpt.sum(m_prods, axis=-1) + + +n = 500 try: q = dpctl.SyclQueue(property="enable_profiling") @@ -33,32 +46,36 @@ ) exit(0) -a = dpt.reshape(dpt.arange(n * n, dtype=np.float32, sycl_queue=q), (n, n)) -b = dpt.reshape( - dpt.asarray(np.random.random(n * n), dtype=np.float32, sycl_queue=q), (n, n) -) +a_flat = dpt.arange(n * n, dtype=dpt.float32, sycl_queue=q) +a = dpt.reshape(a_flat, (n, n)) -timer = SyclTimer(time_scale=1) +b_rand = np.random.random(n * n).astype(np.float32) +b_flat = dpt.asarray(b_rand, dtype=dpt.float32, sycl_queue=q) +b = dpt.reshape(b_flat, (n, n)) wall_times = [] device_times = [] + print( - f"Performing matrix multiplication of two {n} by {n} matrices " + f"Computing naive matrix multiplication of two {n} by {n} matrices " f"on {q.sycl_device.name}, repeating 5 times." ) +print() for _ in range(5): + timer = SyclTimer(time_scale=1) with timer(q): - a_matmul_b = dpnp.matmul(a, b) + a_matmul_b = matmul(a, b) host_time, device_time = timer.dt wall_times.append(host_time) device_times.append(device_time) -c = dpnp.asnumpy(a_matmul_b) -cc = np.dot(dpnp.asnumpy(a), dpnp.asnumpy(b)) +c = dpt.asnumpy(a_matmul_b) +cc = np.dot(dpt.asnumpy(a), dpt.asnumpy(b)) print("Wall time: ", wall_times, "\nDevice time: ", device_times) +print() print( "Accuracy test: passed." if np.allclose(c, cc) - else (f"Accuracy test: failed. Discrepancy {np.max(np.abs(c-cc))}") + else (f"Accuracy test: FAILED. 
\n Discrepancy = {np.max(np.abs(c-cc))}") ) From af28d981d513729a31349fe19cb1da10515ab849 Mon Sep 17 00:00:00 2001 From: ndgrigorian <46709016+ndgrigorian@users.noreply.github.com> Date: Fri, 3 Nov 2023 13:19:41 -0700 Subject: [PATCH 71/83] Improves performance of search reductions for small numbers of elements (#1464) * Adds SequentialSearchReduction functor to search reductions * Search reductions use correct branch for float16 constexpr branch logic accounted for floating point types but not sycl::half, which meant NaNs were not propagating for float16 data --- .../libtensor/include/kernels/reductions.hpp | 251 +++++++++++++++++- 1 file changed, 248 insertions(+), 3 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index f3754e8820..6651483c6c 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -3401,6 +3401,129 @@ struct LogSumExpOverAxis0TempsContigFactory // Argmax and Argmin +/* Sequential search reduction */ + +template +struct SequentialSearchReduction +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + argT identity_; + IdxReductionOp idx_reduction_op_; + outT idx_identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + size_t reduction_max_gid_ = 0; + +public: + SequentialSearchReduction(const argT *inp, + outT *res, + ReductionOp reduction_op, + const argT &identity_val, + IdxReductionOp idx_reduction_op, + const outT &idx_identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + size_t reduction_size) + : inp_(inp), out_(res), reduction_op_(reduction_op), + identity_(identity_val), idx_reduction_op_(idx_reduction_op), + idx_identity_(idx_identity_val), + inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size) + { + } + + void operator()(sycl::id<1> id) const + { + + auto const &inp_out_iter_offsets_ = inp_out_iter_indexer_(id[0]); + const py::ssize_t &inp_iter_offset = + inp_out_iter_offsets_.get_first_offset(); + const py::ssize_t &out_iter_offset = + inp_out_iter_offsets_.get_second_offset(); + + argT red_val(identity_); + outT idx_val(idx_identity_); + for (size_t m = 0; m < reduction_max_gid_; ++m) { + const py::ssize_t inp_reduction_offset = + inp_reduced_dims_indexer_(m); + const py::ssize_t inp_offset = + inp_iter_offset + inp_reduction_offset; + + argT val = inp_[inp_offset]; + if (val == red_val) { + idx_val = idx_reduction_op_(idx_val, static_cast(m)); + } + else { + if constexpr (su_ns::IsMinimum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::less_complex; + // less_complex always returns false for NaNs, so check + if (less_complex(val, red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + red_val = val; + idx_val = static_cast(m); + } + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { + if (val < red_val || std::isnan(val)) { + red_val = val; + idx_val = static_cast(m); + } + } + else { + if (val < red_val) { + red_val = val; + idx_val = static_cast(m); + } + } + } + else if constexpr (su_ns::IsMaximum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::greater_complex; + if 
(greater_complex(val, red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + red_val = val; + idx_val = static_cast(m); + } + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { + if (val > red_val || std::isnan(val)) { + red_val = val; + idx_val = static_cast(m); + } + } + else { + if (val > red_val) { + red_val = val; + idx_val = static_cast(m); + } + } + } + } + } + out_[out_iter_offset] = idx_val; + } +}; + /* = Search reduction using reduce_over_group*/ template ) { + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { if (val < local_red_val || std::isnan(val)) { local_red_val = val; if constexpr (!First) { @@ -3714,7 +3839,9 @@ struct CustomSearchReduction } } } - else if constexpr (std::is_floating_point_v) { + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { if (val > local_red_val || std::isnan(val)) { local_red_val = val; if constexpr (!First) { @@ -3757,7 +3884,9 @@ struct CustomSearchReduction ? local_idx : idx_identity_; } - else if constexpr (std::is_floating_point_v) { + else if constexpr (std::is_floating_point_v || + std::is_same_v) + { // equality does not hold for NaNs, so check here local_idx = (red_val_over_wg == local_red_val || std::isnan(local_red_val)) @@ -3799,6 +3928,14 @@ typedef sycl::event (*search_strided_impl_fn_ptr)( py::ssize_t, const std::vector &); +template +class search_seq_strided_krn; + template class custom_search_over_group_temps_strided_krn; +template +class search_seq_contig_krn; + template (); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, + iter_shape_and_strides}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialSearchReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems)); + }); + + return comp_ev; + } + constexpr size_t preferred_reductions_per_wi = 4; // max_max_wg prevents running out of resources on CPU size_t max_wg = @@ -4419,6 +4594,39 @@ sycl::event search_axis1_over_group_temps_contig_impl( const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputIterIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{0, static_cast(iter_nelems), + static_cast(reduction_nelems)}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{}; + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialSearchReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems)); + }); + + return 
comp_ev; + } + constexpr size_t preferred_reductions_per_wi = 8; // max_max_wg prevents running out of resources on CPU size_t max_wg = @@ -4801,6 +5009,43 @@ sycl::event search_axis0_over_group_temps_contig_impl( const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{ + 0, static_cast(reduction_nelems), + static_cast(iter_nelems)}; + + using KernelName = + class search_seq_contig_krn; + + sycl::range<1> iter_range{iter_nelems}; + + cgh.parallel_for( + iter_range, + SequentialSearchReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems)); + }); + + return comp_ev; + } + constexpr size_t preferred_reductions_per_wi = 8; // max_max_wg prevents running out of resources on CPU size_t max_wg = From eb21e50cc1ee29cf7c7e6e442d0075c3425b16e9 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Nov 2023 11:01:43 -0500 Subject: [PATCH 72/83] Remove superfluous includes in tensor_ctors.cpp per PR review --- dpctl/tensor/libtensor/source/tensor_ctors.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/dpctl/tensor/libtensor/source/tensor_ctors.cpp b/dpctl/tensor/libtensor/source/tensor_ctors.cpp index 8db874cbe1..4720f6baa1 100644 --- a/dpctl/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl/tensor/libtensor/source/tensor_ctors.cpp @@ -37,7 +37,6 @@ #include "accumulators.hpp" #include "boolean_advanced_indexing.hpp" -#include "boolean_reductions.hpp" #include "clip.hpp" #include "copy_and_cast_usm_to_usm.hpp" #include "copy_for_reshape.hpp" @@ -48,7 +47,6 @@ #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include "linear_sequences.hpp" -#include "reductions/reduction_common.hpp" #include "repeat.hpp" #include "simplify_iteration_space.hpp" #include "triul_ctor.hpp" From d4d499267ef3cc7a2d0c73f65d287efc589f8dfb Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 3 Nov 2023 11:03:17 -0500 Subject: [PATCH 73/83] Fixed misspelled words --- dpctl/tensor/_elementwise_common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py index fe2a77a9d5..4a0d1c451f 100644 --- a/dpctl/tensor/_elementwise_common.py +++ b/dpctl/tensor/_elementwise_common.py @@ -421,7 +421,7 @@ class BinaryElementwiseFunc: implementation functions supports it, or returns `None` otherwise. binary_dp_impl_fn (callable): - Data-parallel umplementation function with signature + Data-parallel implementation function with signature `impl_fn(src1: usm_ndarray, src2: usm_ndarray, dst: usm_ndarray, sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])` where the `src1` and `src2` are the argument arrays, `dst` is the @@ -437,7 +437,7 @@ class BinaryElementwiseFunc: docs (str): Documentation string for the unary function. 
binary_inplace_fn (callable, optional): - Data-parallel omplementation function with signature + Data-parallel implementation function with signature `impl_fn(src: usm_ndarray, dst: usm_ndarray, sycl_queue: SyclQueue, depends: Optional[List[SyclEvent]])` where the `src` is the argument array, `dst` is the From 7e790830cd04bdf7c9503d6861a5857d9e626650 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 5 Nov 2023 16:49:46 -0600 Subject: [PATCH 74/83] Enable use of np.int64 to specify shape of usm_ndarray --- dpctl/tensor/_usmarray.pyx | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index ba18600135..94c3dc7d7c 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -182,13 +182,19 @@ cdef class usm_ndarray: cdef bint is_fp16 = False self._reset() - if (not isinstance(shape, (list, tuple)) - and not hasattr(shape, 'tolist')): - try: - shape - shape = [shape, ] - except Exception: - raise TypeError("Argument shape must be a list or a tuple.") + if not isinstance(shape, (list, tuple)): + if hasattr(shape, 'tolist'): + fn = getattr(shape, 'tolist') + if callable(fn): + shape = shape.tolist() + if not isinstance(shape, (list, tuple)): + try: + shape + shape = [shape, ] + except Exception: + raise TypeError( + "Argument shape must be a list or a tuple." + ) nd = len(shape) if dtype is None: if isinstance(buffer, (dpmem._memory._Memory, usm_ndarray)): From 2bc793923ddaaadc6bc8e05a72710943137a8432 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 5 Nov 2023 16:50:33 -0600 Subject: [PATCH 75/83] Add a test for shape being np.int64 scalar --- dpctl/tests/test_usm_ndarray_ctor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dpctl/tests/test_usm_ndarray_ctor.py b/dpctl/tests/test_usm_ndarray_ctor.py index 72f5aabebb..095bbc5638 100644 --- a/dpctl/tests/test_usm_ndarray_ctor.py +++ b/dpctl/tests/test_usm_ndarray_ctor.py @@ -39,6 +39,7 @@ (2, 5, 2), (2, 2, 2, 2, 2, 2, 2, 2), 5, + np.int32(7), ], ) @pytest.mark.parametrize("usm_type", ["shared", "host", "device"]) From aadb6b4aea27f3d2cb0144520076336af61b3cb2 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 5 Nov 2023 16:50:52 -0600 Subject: [PATCH 76/83] Eliminated multiple uses of same literal constants in test_search_reduction_kernels --- dpctl/tests/test_usm_ndarray_reductions.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/dpctl/tests/test_usm_ndarray_reductions.py b/dpctl/tests/test_usm_ndarray_reductions.py index cbfd6baec6..0969822e6d 100644 --- a/dpctl/tests/test_usm_ndarray_reductions.py +++ b/dpctl/tests/test_usm_ndarray_reductions.py @@ -175,9 +175,11 @@ def test_search_reduction_kernels(arg_dtype): q = get_queue_or_skip() skip_if_dtype_not_supported(arg_dtype, q) - x = dpt.ones((24 * 1025), dtype=arg_dtype, sycl_queue=q) + x_shape = (24, 1024) + x_size = np.prod(x_shape) + x = dpt.ones(x_size, dtype=arg_dtype, sycl_queue=q) idx = randrange(x.size) - idx_tup = np.unravel_index(idx, (24, 1025)) + idx_tup = np.unravel_index(idx, x_shape) x[idx] = 2 m = dpt.argmax(x) @@ -194,7 +196,7 @@ def test_search_reduction_kernels(arg_dtype): m = dpt.argmax(y) assert m == 2 * idx - x = dpt.reshape(x, (24, 1025)) + x = dpt.reshape(x, x_shape) x[idx_tup[0], :] = 3 m = dpt.argmax(x, axis=0) @@ -209,15 +211,15 @@ def test_search_reduction_kernels(arg_dtype): m = dpt.argmax(x, axis=1) assert dpt.all(m == idx) - x = dpt.ones((24 * 1025), dtype=arg_dtype, sycl_queue=q) + x = 
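As an illustration, a minimal reproducer in the spirit of gh-1468; the
shape and fill value mirror the test added in the next commit, and the
final assertion is what that test checks:

    import dpctl.tensor as dpt

    a = dpt.full((2, 3, 4), 123456789, dtype=dpt.int32)
    # the sum is no longer computed in an integer intermediate that is
    # only cast to float32 at the end, so the result stays positive
    t = dpt.sum(a, dtype="f4")
    assert t > 0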
dpt.ones(x_size, dtype=arg_dtype, sycl_queue=q) idx = randrange(x.size) - idx_tup = np.unravel_index(idx, (24, 1025)) + idx_tup = np.unravel_index(idx, x_shape) x[idx] = 0 m = dpt.argmin(x) assert m == idx - x = dpt.reshape(x, (24, 1025)) + x = dpt.reshape(x, x_shape) x[idx_tup[0], :] = -1 m = dpt.argmin(x, axis=0) From 07c075bdb1e087ea15e14ffa11846e49679c45c6 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 7 Nov 2023 10:31:41 -0800 Subject: [PATCH 77/83] Corrected argmin/argmax docstring Removed mention of dtype kwarg in usage line --- dpctl/tensor/_reduction.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index 0cd302cccc..9b078211fc 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -445,7 +445,7 @@ def _comparison_over_axis(x, axis, keepdims, _reduction_fn): def max(x, axis=None, keepdims=False): - """max(x, axis=None, dtype=None, keepdims=False) + """max(x, axis=None, keepdims=False) Calculates the maximum value of the input array `x`. @@ -473,7 +473,7 @@ def max(x, axis=None, keepdims=False): def min(x, axis=None, keepdims=False): - """min(x, axis=None, dtype=None, keepdims=False) + """min(x, axis=None, keepdims=False) Calculates the minimum value of the input array `x`. @@ -550,7 +550,7 @@ def _search_over_axis(x, axis, keepdims, _reduction_fn): def argmax(x, axis=None, keepdims=False): - """argmax(x, axis=None, dtype=None, keepdims=False) + """argmax(x, axis=None, keepdims=False) Returns the indices of the maximum values of the input array `x` along a specified axis. @@ -582,7 +582,7 @@ def argmax(x, axis=None, keepdims=False): def argmin(x, axis=None, keepdims=False): - """argmin(x, axis=None, dtype=None, keepdims=False) + """argmin(x, axis=None, keepdims=False) Returns the indices of the minimum values of the input array `x` along a specified axis. From 80e2f29e4dfbb4f7a9a3c198f0d5314b132d1044 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 7 Nov 2023 10:47:31 -0800 Subject: [PATCH 78/83] Fixed gh-1468 Function _reduce_over_axis promotes input array to requested result data type and carries out reduction computation in that data type. This is done in dtype if implementation supports it. If implementation does not support the requested dtype, we reduce in the default_dtype, and cast to the request dtype afterwards. 
--- dpctl/tensor/_reduction.py | 62 ++++++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 16 deletions(-) diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index 9b078211fc..79ce231901 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -118,7 +118,7 @@ def _reduction_over_axis( dpt.full( res_shape, _identity, - dtype=_default_reduction_type_fn(inp_dt, q), + dtype=dtype, usm_type=res_usm_type, sycl_queue=q, ), @@ -142,21 +142,51 @@ def _reduction_over_axis( "Automatically determined reduction data type does not " "have direct implementation" ) - tmp_dt = _default_reduction_type_fn(inp_dt, q) - tmp = dpt.empty( - res_shape, dtype=tmp_dt, usm_type=res_usm_type, sycl_queue=q - ) - ht_e_tmp, r_e = _reduction_fn( - src=arr2, trailing_dims_to_reduce=red_nd, dst=tmp, sycl_queue=q - ) - host_tasks_list.append(ht_e_tmp) - res = dpt.empty( - res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q - ) - ht_e, _ = ti._copy_usm_ndarray_into_usm_ndarray( - src=tmp, dst=res, sycl_queue=q, depends=[r_e] - ) - host_tasks_list.append(ht_e) + if _dtype_supported(res_dt, res_dt, res_usm_type, q): + tmp = dpt.empty( + arr2.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr2, dst=tmp, sycl_queue=q + ) + host_tasks_list.append(ht_e_cpy) + res = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_red, _ = _reduction_fn( + src=tmp, + trailing_dims_to_reduce=red_nd, + dst=res, + sycl_queue=q, + depends=[cpy_e], + ) + host_tasks_list.append(ht_e_red) + else: + buf_dt = _default_reduction_type_fn(inp_dt, q) + tmp = dpt.empty( + arr2.shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr2, dst=tmp, sycl_queue=q + ) + tmp_res = dpt.empty( + res_shape, dtype=buf_dt, usm_type=res_usm_type, sycl_queue=q + ) + host_tasks_list.append(ht_e_cpy) + res = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_red, r_e = _reduction_fn( + src=tmp, + trailing_dims_to_reduce=red_nd, + dst=tmp_res, + sycl_queue=q, + depends=[cpy_e], + ) + ht_e_cpy2, _ = ti._copy_usm_ndarray_into_usm_ndarray( + src=tmp_res, dst=res, sycl_queue=q, depends=[r_e] + ) + host_tasks_list.append(ht_e_cpy2) if keepdims: res_shape = res_shape + (1,) * red_nd From ff9b5ebac31c874b91ef8b834907a3145a9c1c49 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 7 Nov 2023 11:06:16 -0600 Subject: [PATCH 79/83] Added a test based on gh-1468 --- dpctl/tests/test_tensor_sum.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/dpctl/tests/test_tensor_sum.py b/dpctl/tests/test_tensor_sum.py index 749ca055b9..33fe4a8b4f 100644 --- a/dpctl/tests/test_tensor_sum.py +++ b/dpctl/tests/test_tensor_sum.py @@ -329,3 +329,12 @@ def test_prod_arg_out_dtype_matrix(arg_dtype, out_dtype): assert isinstance(r, dpt.usm_ndarray) assert r.dtype == dpt.dtype(out_dtype) assert dpt.all(r == 1) + + +def test_gh_1468(): + "See https://github.com/IntelPython/dpctl/issues/1468" + get_queue_or_skip() + + a = dpt.full((2, 3, 4), 123456789, dtype=dpt.int32) + t = dpt.sum(a, dtype="f4") + assert t > 0 From da594763acb042330e723d160258dabc570ff964 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 7 Nov 2023 15:32:57 -0600 Subject: [PATCH 80/83] Changed TypeError wording per PR feedback --- dpctl/tensor/_usmarray.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git 
a/dpctl/tensor/_usmarray.pyx b/dpctl/tensor/_usmarray.pyx index 94c3dc7d7c..5b394d971b 100644 --- a/dpctl/tensor/_usmarray.pyx +++ b/dpctl/tensor/_usmarray.pyx @@ -191,10 +191,11 @@ cdef class usm_ndarray: try: shape shape = [shape, ] - except Exception: + except Exception as e: raise TypeError( - "Argument shape must be a list or a tuple." - ) + "Argument shape must be a non-negative integer, " + "or a list/tuple of such integers." + ) from e nd = len(shape) if dtype is None: if isinstance(buffer, (dpmem._memory._Memory, usm_ndarray)): From ca2c6aa9f9a0df34471c93c7b80fdec950576f80 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Tue, 7 Nov 2023 17:10:21 -0600 Subject: [PATCH 81/83] Removed redundant astype function call --- dpctl/tensor/_reduction.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index 79ce231901..f797d24b0b 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -114,15 +114,12 @@ def _reduction_over_axis( res_shape = res_shape + (1,) * red_nd inv_perm = sorted(range(nd), key=lambda d: perm[d]) res_shape = tuple(res_shape[i] for i in inv_perm) - return dpt.astype( - dpt.full( - res_shape, - _identity, - dtype=dtype, - usm_type=res_usm_type, - sycl_queue=q, - ), - res_dt, + return dpt.full( + res_shape, + _identity, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=q, ) if red_nd == 0: return dpt.astype(x, res_dt, copy=False) From f2af7537692879171cf3442e100935c45c9a9eb0 Mon Sep 17 00:00:00 2001 From: ndgrigorian <46709016+ndgrigorian@users.noreply.github.com> Date: Wed, 8 Nov 2023 08:38:05 -0800 Subject: [PATCH 82/83] Add array API inspection utilities to ``dpctl.tensor`` (#1469) * Adds __array_namespace_info__ inspection utility This inspection utility is coming to the array API specification in the near future * Set __array_api_version__ to "2022.12" * Remove --ci from array API conformity workflow * Adds __array_namespace_info__ docstrings Disallows dtypes for `kind` kwarg in __array_namespace_info__().dtypes Removes `float16` from dtypes listed by __array_namespace_info__ as per spec Permits dpctl.tensor.Device objects in device keyword arguments in array API inspection utilities * Adds tests for array API inspection --- .github/workflows/conda-package.yml | 2 +- dpctl/tensor/__init__.py | 3 + dpctl/tensor/_array_api.py | 207 ++++++++++++++++++ .../tests/test_tensor_array_api_inspection.py | 163 ++++++++++++++ 4 files changed, 374 insertions(+), 1 deletion(-) create mode 100644 dpctl/tensor/_array_api.py create mode 100644 dpctl/tests/test_tensor_array_api_inspection.py diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml index b09be78b08..b6a4649a30 100644 --- a/.github/workflows/conda-package.yml +++ b/.github/workflows/conda-package.yml @@ -666,7 +666,7 @@ jobs: python -c "import dpctl; dpctl.lsplatform()" export ARRAY_API_TESTS_MODULE=dpctl.tensor cd /home/runner/work/array-api-tests - pytest --ci --json-report --json-report-file=$FILE array_api_tests/ || true + pytest --json-report --json-report-file=$FILE array_api_tests/ || true - name: Set Github environment variables shell: bash -l {0} run: | diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index 5eee3e9ab9..8638fc6d29 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -93,6 +93,7 @@ from dpctl.tensor._usmarray import usm_ndarray from dpctl.tensor._utility_functions import all, any +from ._array_api import
__array_api_version__, __array_namespace_info__ from ._clip import clip from ._constants import e, inf, nan, newaxis, pi from ._elementwise_funcs import ( @@ -335,4 +336,6 @@ "clip", "logsumexp", "reduce_hypot", + "__array_api_version__", + "__array_namespace_info__", ] diff --git a/dpctl/tensor/_array_api.py b/dpctl/tensor/_array_api.py new file mode 100644 index 0000000000..613d6dcd66 --- /dev/null +++ b/dpctl/tensor/_array_api.py @@ -0,0 +1,207 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import dpctl +import dpctl.tensor as dpt +from dpctl.tensor._tensor_impl import ( + default_device_complex_type, + default_device_fp_type, + default_device_index_type, + default_device_int_type, +) + + +def _isdtype_impl(dtype, kind): + if isinstance(kind, str): + if kind == "bool": + return dtype.kind == "b" + elif kind == "signed integer": + return dtype.kind == "i" + elif kind == "unsigned integer": + return dtype.kind == "u" + elif kind == "integral": + return dtype.kind in "iu" + elif kind == "real floating": + return dtype.kind == "f" + elif kind == "complex floating": + return dtype.kind == "c" + elif kind == "numeric": + return dtype.kind in "iufc" + else: + raise ValueError(f"Unrecognized data type kind: {kind}") + + elif isinstance(kind, tuple): + return any(_isdtype_impl(dtype, k) for k in kind) + else: + raise TypeError(f"Unsupported data type kind: {kind}") + + +__array_api_version__ = "2022.12" + + +class Info: + """ + namespace returned by `__array_namespace_info__()` + """ + + def __init__(self): + self._capabilities = { + "boolean_indexing": True, + "data_dependent_shapes": True, + } + self._all_dtypes = { + "bool": dpt.bool, + "float32": dpt.float32, + "float64": dpt.float64, + "complex64": dpt.complex64, + "complex128": dpt.complex128, + "int8": dpt.int8, + "int16": dpt.int16, + "int32": dpt.int32, + "int64": dpt.int64, + "uint8": dpt.uint8, + "uint16": dpt.uint16, + "uint32": dpt.uint32, + "uint64": dpt.uint64, + } + + def capabilities(self): + """ + Returns a dictionary of `dpctl`'s capabilities. + + Returns: + dict: + dictionary of `dpctl`'s capabilities + - `boolean_indexing`: bool + - `data_dependent_shapes`: bool + """ + return self._capabilities.copy() + + def default_device(self): + """ + Returns the default SYCL device. + """ + return dpctl.select_default_device() + + def default_dtypes(self, device=None): + """ + Returns a dictionary of default data types for `device`. + + Args: + device (Optional[dpctl.SyclDevice, dpctl.SyclQueue, + dpctl.tensor.Device]): + array API concept of device used in getting default data types. + `device` can be `None` (in which case the default device is + used), an instance of :class:`dpctl.SyclDevice` corresponding + to a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a `Device` object returned by + :attr:`dpctl.tensor.usm_ndarray.device`. Default: `None`.
+ + Returns: + dict: + a dictionary of default data types for `device` + - `real floating`: dtype + - `complex floating`: dtype + - `integral`: dtype + - `indexing`: dtype + """ + if device is None: + device = dpctl.select_default_device() + elif isinstance(device, dpt.Device): + device = device.sycl_device + return { + "real floating": dpt.dtype(default_device_fp_type(device)), + "complex floating": dpt.dtype(default_device_complex_type(device)), + "integral": dpt.dtype(default_device_int_type(device)), + "indexing": dpt.dtype(default_device_index_type(device)), + } + + def dtypes(self, device=None, kind=None): + """ + Returns a dictionary of all Array API data types of a specified `kind` + supported by `device`. + + This dictionary only includes data types supported by the array API. + + See [array API][array_api]. + + [array_api]: https://data-apis.org/array-api/latest/ + + Args: + device (Optional[dpctl.SyclDevice, dpctl.SyclQueue, + dpctl.tensor.Device, str]): + array API concept of device used in getting the data types. + `device` can be `None` (in which case the default device is + used), an instance of :class:`dpctl.SyclDevice` corresponding + to a non-partitioned SYCL device, an instance of + :class:`dpctl.SyclQueue`, or a `Device` object returned by + :attr:`dpctl.tensor.usm_ndarray.device`. Default: `None`. + + kind (Optional[str, Tuple[str, ...]]): + data type kind. + - if `kind` is `None`, returns a dictionary of all data types + supported by `device` + - if `kind` is a string, returns a dictionary containing the + data types belonging to the data type kind specified. + Supports: + - "bool" + - "signed integer" + - "unsigned integer" + - "integral" + - "real floating" + - "complex floating" + - "numeric" + - if `kind` is a tuple, the tuple represents a union of `kind` + strings, and returns a dictionary containing data types + corresponding to the specified union. + Default: `None`. + + Returns: + dict: + a dictionary of the supported data types of the specified `kind` + """ + if device is None: + device = dpctl.select_default_device() + elif isinstance(device, dpt.Device): + device = device.sycl_device + _fp64 = device.has_aspect_fp64 + if kind is None: + return { + key: val + for key, val in self._all_dtypes.items() + if (key != "float64" or _fp64) + } + else: + return { + key: val + for key, val in self._all_dtypes.items() + if (key != "float64" or _fp64) and _isdtype_impl(val, kind) + } + + def devices(self): + """ + Returns a list of supported devices. + """ + return dpctl.get_devices() + + +def __array_namespace_info__(): + """__array_namespace_info__() + + Returns a namespace with Array API namespace inspection utilities. + + """ + return Info() diff --git a/dpctl/tests/test_tensor_array_api_inspection.py b/dpctl/tests/test_tensor_array_api_inspection.py new file mode 100644 index 0000000000..5ae0d35f8e --- /dev/null +++ b/dpctl/tests/test_tensor_array_api_inspection.py @@ -0,0 +1,163 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import dpctl +import dpctl.tensor as dpt +from dpctl.tensor._tensor_impl import ( + default_device_complex_type, + default_device_fp_type, + default_device_index_type, + default_device_int_type, +) + +_dtypes_no_fp16_fp64 = { + "bool": dpt.bool, + "float32": dpt.float32, + "complex64": dpt.complex64, + "complex128": dpt.complex128, + "int8": dpt.int8, + "int16": dpt.int16, + "int32": dpt.int32, + "int64": dpt.int64, + "uint8": dpt.uint8, + "uint16": dpt.uint16, + "uint32": dpt.uint32, + "uint64": dpt.uint64, +} + + +class MockDevice: + def __init__(self, fp16: bool, fp64: bool): + self.has_aspect_fp16 = fp16 + self.has_aspect_fp64 = fp64 + + +def test_array_api_inspection_methods(): + info = dpt.__array_namespace_info__() + assert info.capabilities() + assert info.default_device() + assert info.default_dtypes() + assert info.devices() + assert info.dtypes() + + +def test_array_api_inspection_default_device(): + assert ( + dpt.__array_namespace_info__().default_device() + == dpctl.select_default_device() + ) + + +def test_array_api_inspection_devices(): + devices1 = dpt.__array_namespace_info__().devices() + devices2 = dpctl.get_devices() + assert len(devices1) == len(devices2) + assert devices1 == devices2 + + +def test_array_api_inspection_capabilities(): + capabilities = dpt.__array_namespace_info__().capabilities() + assert capabilities["boolean_indexing"] + assert capabilities["data_dependent_shapes"] + + +def test_array_api_inspection_default_dtypes(): + dev = dpctl.select_default_device() + + int_dt = default_device_int_type(dev) + ind_dt = default_device_index_type(dev) + fp_dt = default_device_fp_type(dev) + cm_dt = default_device_complex_type(dev) + + info = dpt.__array_namespace_info__() + default_dts_nodev = info.default_dtypes() + default_dts_dev = info.default_dtypes(dev) + + assert ( + int_dt == default_dts_nodev["integral"] == default_dts_dev["integral"] + ) + assert ( + ind_dt == default_dts_nodev["indexing"] == default_dts_dev["indexing"] + ) + assert ( + fp_dt + == default_dts_nodev["real floating"] + == default_dts_dev["real floating"] + ) + assert ( + cm_dt + == default_dts_nodev["complex floating"] + == default_dts_dev["complex floating"] + ) + + +def test_array_api_inspection_default_device_dtypes(): + dev = dpctl.select_default_device() + dtypes = _dtypes_no_fp16_fp64.copy() + if dev.has_aspect_fp64: + dtypes["float64"] = dpt.float64 + + assert dtypes == dpt.__array_namespace_info__().dtypes() + + +@pytest.mark.parametrize("fp16", [True, False]) +@pytest.mark.parametrize("fp64", [True, False]) +def test_array_api_inspection_device_dtypes(fp16, fp64): + dev = MockDevice(fp16, fp64) + dtypes = _dtypes_no_fp16_fp64.copy() + if fp64: + dtypes["float64"] = dpt.float64 + + assert dtypes == dpt.__array_namespace_info__().dtypes(device=dev) + + +def test_array_api_inspection_dtype_kind(): + info = dpt.__array_namespace_info__() + + f_dtypes = info.dtypes(kind="real floating") + assert all([_dt[1].kind == "f" for _dt in f_dtypes.items()]) + + i_dtypes = info.dtypes(kind="signed integer") + assert all([_dt[1].kind == "i" for _dt in i_dtypes.items()]) + + u_dtypes = info.dtypes(kind="unsigned integer") + assert all([_dt[1].kind == "u" for _dt in u_dtypes.items()]) + + ui_dtypes = info.dtypes(kind="integral") + assert all([_dt[1].kind in "ui" for _dt in ui_dtypes.items()]) + + c_dtypes = info.dtypes(kind="complex floating") + assert
all([_dt[1].kind == "c" for _dt in c_dtypes.items()]) + + assert info.dtypes(kind="bool") == {"bool": dpt.bool} + + _signed_ints = { + "int8": dpt.int8, + "int16": dpt.int16, + "int32": dpt.int32, + "int64": dpt.int64, + } + assert ( + info.dtypes(kind=("signed integer", "signed integer")) == _signed_ints + ) + assert ( + info.dtypes( + kind=("integral", "bool", "real floating", "complex floating") + ) + == info.dtypes() + ) From 9afb74268a919ca0f0fcb2ca3d0b1df95d82b57e Mon Sep 17 00:00:00 2001 From: ndgrigorian <46709016+ndgrigorian@users.noreply.github.com> Date: Wed, 8 Nov 2023 13:31:59 -0800 Subject: [PATCH 83/83] Implements statistical functions ``mean``, ``std``, ``var`` (#1465) * Resolves gh-1456 Tree reductions now populate destination with the identity when reducing over zero-size axes. As a result, logic was removed for handling zero-size axes. ``argmax``, ``argmin``, ``max``, and ``min`` still raise an error for zero-size axes. Reductions now return a copy when provided an empty axis tuple. Adds additional supported dtype combinations to ``prod`` and ``sum``, specifically for input integers and inexact output types * Implements mean, var, and std * Adds more tests for statistical functions * Adds docstrings for statistical functions * Adds more supported types to arithmetic reductions Permits `float` accumulation type with 64-bit integer and unsigned integer inputs to prevent unnecessary copies on devices that don't support double precision * Changes mean reduction to use output data type as sum accumulation type Mean in-place division now uses the real type for the denominator --- dpctl/tensor/__init__.py | 4 + dpctl/tensor/_reduction.py | 48 +-- dpctl/tensor/_statistical_functions.py | 381 ++++++++++++++++++ .../libtensor/include/kernels/reductions.hpp | 126 +++++- .../source/reductions/reduction_over_axis.hpp | 12 + .../test_tensor_statistical_functions.py | 254 ++++++++++++ 6 files changed, 797 insertions(+), 28 deletions(-) create mode 100644 dpctl/tensor/_statistical_functions.py create mode 100644 dpctl/tests/test_tensor_statistical_functions.py diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index 8638fc6d29..4355fea442 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -90,6 +90,7 @@ ) from dpctl.tensor._reshape import reshape from dpctl.tensor._search_functions import where +from dpctl.tensor._statistical_functions import mean, std, var from dpctl.tensor._usmarray import usm_ndarray from dpctl.tensor._utility_functions import all, any @@ -336,6 +337,9 @@ "clip", "logsumexp", "reduce_hypot", + "mean", + "std", + "var", "__array_api_version__", "__array_namespace_info__", ] diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index f797d24b0b..059ba61030 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -83,7 +83,6 @@ def _reduction_over_axis( _reduction_fn, _dtype_supported, _default_reduction_type_fn, - _identity=None, ): if not isinstance(x, dpt.usm_ndarray): raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") @@ -106,23 +105,8 @@ def _reduction_over_axis( res_dt = _to_device_supported_dtype(res_dt, q.sycl_device) res_usm_type = x.usm_type - if x.size == 0: - if _identity is None: - raise ValueError("reduction does not support zero-size arrays") - else: - if keepdims: - res_shape = res_shape + (1,) * red_nd - inv_perm = sorted(range(nd), key=lambda d: perm[d]) - res_shape = tuple(res_shape[i] for i in inv_perm) - return dpt.full( - res_shape, - _identity, - dtype=res_dt, -
usm_type=res_usm_type, - sycl_queue=q, - ) if red_nd == 0: - return dpt.astype(x, res_dt, copy=False) + return dpt.astype(x, res_dt, copy=True) host_tasks_list = [] if _dtype_supported(inp_dt, res_dt, res_usm_type, q): @@ -251,7 +235,6 @@ def sum(x, axis=None, dtype=None, keepdims=False): tri._sum_over_axis, tri._sum_over_axis_dtype_supported, _default_reduction_dtype, - _identity=0, ) @@ -312,7 +295,6 @@ def prod(x, axis=None, dtype=None, keepdims=False): tri._prod_over_axis, tri._prod_over_axis_dtype_supported, _default_reduction_dtype, - _identity=1, ) @@ -368,7 +350,6 @@ def logsumexp(x, axis=None, dtype=None, keepdims=False): inp_dt, res_dt ), _default_reduction_dtype_fp_types, - _identity=-dpt.inf, ) @@ -424,7 +405,6 @@ def reduce_hypot(x, axis=None, dtype=None, keepdims=False): inp_dt, res_dt ), _default_reduction_dtype_fp_types, - _identity=0, ) @@ -446,9 +426,19 @@ def _comparison_over_axis(x, axis, keepdims, _reduction_fn): res_dt = x.dtype res_usm_type = x.usm_type if x.size == 0: - raise ValueError("reduction does not support zero-size arrays") + if any([x.shape[i] == 0 for i in axis]): + raise ValueError( + "reduction cannot be performed over zero-size axes" + ) + else: + return dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + ) if red_nd == 0: - return x + return dpt.copy(x) res = dpt.empty( res_shape, @@ -549,7 +539,17 @@ def _search_over_axis(x, axis, keepdims, _reduction_fn): res_dt = ti.default_device_index_type(exec_q.sycl_device) res_usm_type = x.usm_type if x.size == 0: - raise ValueError("reduction does not support zero-size arrays") + if any([x.shape[i] == 0 for i in axis]): + raise ValueError( + "reduction cannot be performed over zero-size axes" + ) + else: + return dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=exec_q, + ) if red_nd == 0: return dpt.zeros( res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q diff --git a/dpctl/tensor/_statistical_functions.py b/dpctl/tensor/_statistical_functions.py new file mode 100644 index 0000000000..54d748d2d2 --- /dev/null +++ b/dpctl/tensor/_statistical_functions.py @@ -0,0 +1,381 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from numpy.core.numeric import normalize_axis_tuple + +import dpctl +import dpctl.tensor as dpt +import dpctl.tensor._tensor_elementwise_impl as tei +import dpctl.tensor._tensor_impl as ti +import dpctl.tensor._tensor_reductions_impl as tri + + +def _var_impl(x, axis, correction, keepdims): + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + perm = [] + nelems = 1 + for i in range(nd): + if i not in axis: + perm.append(i) + else: + nelems *= x.shape[i] + red_nd = len(axis) + perm = perm + list(axis) + q = x.sycl_queue + inp_dt = x.dtype + res_dt = ( + inp_dt + if inp_dt.kind == "f" + else dpt.dtype(ti.default_device_fp_type(q)) + ) + res_usm_type = x.usm_type + + deps = [] + host_tasks_list = [] + if inp_dt != res_dt: + buf = dpt.empty_like(x, dtype=res_dt) + ht_e_buf, c_e1 = ti._copy_usm_ndarray_into_usm_ndarray( + src=x, dst=buf, sycl_queue=q + ) + deps.append(c_e1) + host_tasks_list.append(ht_e_buf) + else: + buf = x + # calculate mean + buf2 = dpt.permute_dims(buf, perm) + res_shape = buf2.shape[: nd - red_nd] + # use keepdims=True path for later broadcasting + if red_nd == 0: + mean_ary = dpt.empty_like(buf) + ht_e1, c_e2 = ti._copy_usm_ndarray_into_usm_ndarray( + src=buf, dst=mean_ary, sycl_queue=q + ) + deps.append(c_e2) + host_tasks_list.append(ht_e1) + else: + mean_ary = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=q, + ) + ht_e1, r_e1 = tri._sum_over_axis( + src=buf2, + trailing_dims_to_reduce=red_nd, + dst=mean_ary, + sycl_queue=q, + depends=deps, + ) + host_tasks_list.append(ht_e1) + deps.append(r_e1) + + mean_ary_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + mean_ary = dpt.permute_dims( + dpt.reshape(mean_ary, mean_ary_shape), inv_perm + ) + # divide in-place to get mean + mean_ary_shape = mean_ary.shape + nelems_ary = dpt.asarray( + nelems, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + if nelems_ary.shape != mean_ary_shape: + nelems_ary = dpt.broadcast_to(nelems_ary, mean_ary_shape) + ht_e2, d_e1 = tei._divide_inplace( + lhs=mean_ary, rhs=nelems_ary, sycl_queue=q, depends=deps + ) + host_tasks_list.append(ht_e2) + # subtract mean from original array to get deviations + dev_ary = dpt.empty_like(buf) + if mean_ary_shape != buf.shape: + mean_ary = dpt.broadcast_to(mean_ary, buf.shape) + ht_e4, su_e = tei._subtract( + src1=buf, src2=mean_ary, dst=dev_ary, sycl_queue=q, depends=[d_e1] + ) + host_tasks_list.append(ht_e4) + # square deviations + ht_e5, sq_e = tei._square( + src=dev_ary, dst=dev_ary, sycl_queue=q, depends=[su_e] + ) + host_tasks_list.append(ht_e5) + deps2 = [] + # take sum of squared deviations + dev_ary2 = dpt.permute_dims(dev_ary, perm) + if red_nd == 0: + res = dev_ary + deps2.append(sq_e) + else: + res = dpt.empty( + res_shape, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=q, + ) + ht_e6, r_e2 = tri._sum_over_axis( + src=dev_ary2, + trailing_dims_to_reduce=red_nd, + dst=res, + sycl_queue=q, + depends=[sq_e], + ) + host_tasks_list.append(ht_e6) + deps2.append(r_e2) + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm) + res_shape = res.shape + # when nelems - correction <= 0, yield nans + div = max(nelems - correction, 0) + if not div: + div = dpt.nan + div_ary = dpt.asarray(div, res_dt, usm_type=res_usm_type, sycl_queue=q) + # divide 
in-place again + if div_ary.shape != res_shape: + div_ary = dpt.broadcast_to(div_ary, res.shape) + ht_e7, d_e2 = tei._divide_inplace( + lhs=res, rhs=div_ary, sycl_queue=q, depends=deps2 + ) + host_tasks_list.append(ht_e7) + return res, [d_e2], host_tasks_list + + +def mean(x, axis=None, keepdims=False): + """mean(x, axis=None, keepdims=False) + + Calculates the arithmetic mean of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which the arithmetic means must be computed. If + a tuple of unique integers, the means are computed over multiple + axes. If `None`, the mean is computed over the entire array. + Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input array according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the arithmetic means. If the mean was computed + over the entire array, a zero-dimensional array is returned. + + If `x` has a floating-point data type, the returned array will have + the same data type as `x`. + If `x` has a boolean or integral data type, the returned array + will have the default floating point data type for the device + where input array `x` is allocated. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + nd = x.ndim + if axis is None: + axis = tuple(range(nd)) + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + perm = [] + nelems = 1 + for i in range(nd): + if i not in axis: + perm.append(i) + else: + nelems *= x.shape[i] + sum_nd = len(axis) + perm = perm + list(axis) + arr2 = dpt.permute_dims(x, perm) + res_shape = arr2.shape[: nd - sum_nd] + q = x.sycl_queue + inp_dt = x.dtype + res_dt = ( + x.dtype + if x.dtype.kind in "fc" + else dpt.dtype(ti.default_device_fp_type(q)) + ) + res_usm_type = x.usm_type + if sum_nd == 0: + return dpt.astype(x, res_dt, copy=True) + + s_e = [] + host_tasks_list = [] + if tri._sum_over_axis_dtype_supported(inp_dt, res_dt, res_usm_type, q): + res = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e1, r_e = tri._sum_over_axis( + src=arr2, trailing_dims_to_reduce=sum_nd, dst=res, sycl_queue=q + ) + host_tasks_list.append(ht_e1) + s_e.append(r_e) + else: + tmp = dpt.empty( + arr2.shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_cpy, cpy_e = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr2, dst=tmp, sycl_queue=q + ) + host_tasks_list.append(ht_e_cpy) + res = dpt.empty( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q + ) + ht_e_red, r_e = tri._sum_over_axis( + src=tmp, + trailing_dims_to_reduce=sum_nd, + dst=res, + sycl_queue=q, + depends=[cpy_e], + ) + host_tasks_list.append(ht_e_red) + s_e.append(r_e) + + if keepdims: + res_shape = res_shape + (1,) * sum_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm) + + res_shape = res.shape + # in-place divide + den_dt = dpt.finfo(res_dt).dtype if res_dt.kind == "c" else res_dt + nelems_arr = dpt.asarray( + nelems, dtype=den_dt, usm_type=res_usm_type, sycl_queue=q + ) + if nelems_arr.shape != res_shape: + nelems_arr = dpt.broadcast_to(nelems_arr, res_shape) + 
ht_e2, _ = tei._divide_inplace( + lhs=res, rhs=nelems_arr, sycl_queue=q, depends=s_e + ) + host_tasks_list.append(ht_e2) + dpctl.SyclEvent.wait_for(host_tasks_list) + return res + + +def var(x, axis=None, correction=0.0, keepdims=False): + """var(x, axis=None, correction=0.0, keepdims=False) + + Calculates the variance of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which the variances must be computed. If a tuple + of unique integers, the variances are computed over multiple axes. + If `None`, the variance is computed over the entire array. + Default: `None`. + correction (Optional[float, int]): + degrees of freedom adjustment. The divisor used in calculating the + variance is `N-correction`, where `N` corresponds to the total + number of elements over which the variance is calculated. + Default: `0.0`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input array according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the variances. If the variance was computed + over the entire array, a zero-dimensional array is returned. + + If `x` has a real-valued floating-point data type, the returned + array will have the same data type as `x`. + If `x` has a boolean or integral data type, the returned array + will have the default floating point data type for the device + where input array `x` is allocated. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + + if not isinstance(correction, (int, float)): + raise TypeError( + "Expected a Python integer or float for `correction`, " + f"got {type(correction)}" + ) + + if x.dtype.kind == "c": + raise ValueError("`var` does not support complex types") + + res, _, host_tasks_list = _var_impl(x, axis, correction, keepdims) + dpctl.SyclEvent.wait_for(host_tasks_list) + return res + + +def std(x, axis=None, correction=0.0, keepdims=False): + """std(x, axis=None, correction=0.0, keepdims=False) + + Calculates the standard deviation of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which the standard deviations must be computed. + If a tuple of unique integers, the standard deviations are computed + over multiple axes. If `None`, the standard deviation is computed + over the entire array. Default: `None`. + correction (Optional[float, int]): + degrees of freedom adjustment. The divisor used in calculating the + standard deviation is `N-correction`, where `N` corresponds to the + total number of elements over which the standard deviation is + calculated. Default: `0.0`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input array according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the standard deviations. If the standard + deviation was computed over the entire array, a zero-dimensional + array is returned.
+ + If `x` has a real-valued floating-point data type, the returned + array will have the same data type as `x`. + If `x` has a boolean or integral data type, the returned array + will have the default floating point data type for the device + where input array `x` is allocated. + """ + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + + if not isinstance(correction, (int, float)): + raise TypeError( + "Expected a Python integer or float for `correction`, " + f"got {type(correction)}" + ) + + if x.dtype.kind == "c": + raise ValueError("`std` does not support complex types") + + res, deps, host_tasks_list = _var_impl(x, axis, correction, keepdims) + ht_ev, _ = tei._sqrt( + src=res, dst=res, sycl_queue=res.sycl_queue, depends=deps + ) + host_tasks_list.append(ht_ev) + dpctl.SyclEvent.wait_for(host_tasks_list) + return res diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 6651483c6c..adbf96be10 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -1009,6 +1009,9 @@ template class custom_reduction_over_group_temps_strided_krn;
(reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(identity_val), iter_nelems, depends); + + return res_init_ev; + } + const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); @@ -2015,7 +2057,7 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( resTy *partially_reduced_tmp2 = nullptr; if (partially_reduced_tmp == nullptr) { - throw std::runtime_error("Unabled to allocate device_memory"); + throw std::runtime_error("Unable to allocate device_memory"); } else { partially_reduced_tmp2 = @@ -2712,12 +2754,16 @@ struct TypePairSupportDataForSumReductionTemps td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, // input int8_t td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, // input uint8_t td_ns::TypePairDefinedEntry, @@ -2727,11 +2773,15 @@ struct TypePairSupportDataForSumReductionTemps td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, // input int16_t td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, // input uint16_t td_ns::TypePairDefinedEntry, @@ -2739,20 +2789,30 @@ struct TypePairSupportDataForSumReductionTemps td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, // input int32_t td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, // input uint32_t td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, // input int64_t td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, - // input uint32_t + // input uint64_t td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, // input half td_ns::TypePairDefinedEntry, @@ -2967,12 +3027,16 @@ struct TypePairSupportDataForProductReductionTemps td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, // input int8_t td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, // input uint8_t td_ns::TypePairDefinedEntry, @@ -2982,11 +3046,15 @@ struct TypePairSupportDataForProductReductionTemps td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, // input int16_t td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, // input uint16_t td_ns::TypePairDefinedEntry, @@ -2994,20 +3062,30 @@ struct TypePairSupportDataForProductReductionTemps td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, // input int32_t td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, 
// input uint32_t td_ns::TypePairDefinedEntry, td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, // input int64_t td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, // input uint32_t td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, // input half td_ns::TypePairDefinedEntry, @@ -3957,6 +4035,8 @@ template class custom_search_over_group_temps_strided_krn; +template class search_empty_krn; + template ::value; constexpr resTy idx_identity_val = su_ns::Identity::value; + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + using IndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + + const py::ssize_t *const &res_shape = iter_shape_and_strides; + const py::ssize_t *const &res_strides = + iter_shape_and_strides + 2 * iter_nd; + IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, + res_strides); + using InitKernelName = + class search_empty_krn; + cgh.depends_on(depends); + + cgh.parallel_for( + sycl::range<1>(iter_nelems), [=](sycl::id<1> id) { + auto res_offset = res_indexer(id[0]); + res_tp[res_offset] = idx_identity_val; + }); + }); + + return res_init_ev; + } + const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); @@ -4590,6 +4694,13 @@ sycl::event search_axis1_over_group_temps_contig_impl( constexpr argTy identity_val = su_ns::Identity::value; constexpr resTy idx_identity_val = su_ns::Identity::value; + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(idx_identity_val), iter_nelems, depends); + + return res_init_ev; + } + const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); @@ -5005,6 +5116,13 @@ sycl::event search_axis0_over_group_temps_contig_impl( constexpr argTy identity_val = su_ns::Identity::value; constexpr resTy idx_identity_val = su_ns::Identity::value; + if (reduction_nelems == 0) { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(idx_identity_val), iter_nelems, depends); + + return res_init_ev; + } + const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp index f1b924dd47..5aafe38a40 100644 --- a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp +++ b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp @@ -205,6 +205,10 @@ std::pair py_reduction_over_axis( size_t dst_nelems = dst.get_size(); + if (dst_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + size_t reduction_nelems(1); for (int i = dst_nd; i < src_nd; ++i) { reduction_nelems *= static_cast(src_shape_ptr[i]); @@ -551,6 +555,10 @@ std::pair py_tree_reduction_over_axis( size_t dst_nelems = dst.get_size(); + if (dst_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + size_t reduction_nelems(1); for (int i = dst_nd; i < src_nd; ++i) { reduction_nelems *= static_cast(src_shape_ptr[i]); @@ -842,6 +850,10 @@ std::pair py_search_over_axis( size_t dst_nelems = dst.get_size(); + if (dst_nelems == 0) { + return std::make_pair(sycl::event(), sycl::event()); + } + size_t reduction_nelems(1); for (int i = 
dst_nd; i < src_nd; ++i) { reduction_nelems *= static_cast(src_shape_ptr[i]); diff --git a/dpctl/tests/test_tensor_statistical_functions.py b/dpctl/tests/test_tensor_statistical_functions.py new file mode 100644 index 0000000000..8916833f86 --- /dev/null +++ b/dpctl/tests/test_tensor_statistical_functions.py @@ -0,0 +1,254 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import dpctl.tensor as dpt +from dpctl.tensor._tensor_impl import default_device_fp_type +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + +_no_complex_dtypes = [ + "?", + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", +] + + +@pytest.mark.parametrize("dt", _no_complex_dtypes) +def test_mean_dtypes(dt): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x = dpt.ones(10, dtype=dt) + res = dpt.mean(x) + assert res == 1 + if x.dtype.kind in "biu": + assert res.dtype == dpt.dtype(default_device_fp_type(q)) + else: + assert res.dtype == x.dtype + + +@pytest.mark.parametrize("dt", _no_complex_dtypes) +@pytest.mark.parametrize("py_zero", [float(0), int(0)]) +def test_std_var_dtypes(dt, py_zero): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dt, q) + + x = dpt.ones(10, dtype=dt) + res = dpt.std(x, correction=py_zero) + assert res == 0 + if x.dtype.kind in "biu": + assert res.dtype == dpt.dtype(default_device_fp_type(q)) + else: + assert res.dtype == x.dtype + + res = dpt.var(x, correction=py_zero) + assert res == 0 + if x.dtype.kind in "biu": + assert res.dtype == dpt.dtype(default_device_fp_type(q)) + else: + assert res.dtype == x.dtype + + +def test_stat_fns_axis(): + get_queue_or_skip() + + x = dpt.ones((3, 4, 5, 6, 7), dtype="f4") + m = dpt.mean(x, axis=(1, 2, -1)) + + assert isinstance(m, dpt.usm_ndarray) + assert m.shape == (3, 6) + assert dpt.allclose(m, dpt.asarray(1, dtype=m.dtype)) + + s = dpt.var(x, axis=(1, 2, -1)) + assert isinstance(s, dpt.usm_ndarray) + assert s.shape == (3, 6) + assert dpt.allclose(s, dpt.asarray(0, dtype=s.dtype)) + + +@pytest.mark.parametrize("fn", [dpt.mean, dpt.var]) +def test_stat_fns_empty(fn): + get_queue_or_skip() + x = dpt.empty((0,), dtype="f4") + r = fn(x) + assert r.shape == tuple() + assert dpt.isnan(r) + + x = dpt.empty((10, 0, 2), dtype="f4") + r = fn(x, axis=1) + assert r.shape == (10, 2) + assert dpt.all(dpt.isnan(r)) + + r = fn(x, axis=0) + assert r.shape == (0, 2) + assert r.size == 0 + + +def test_stat_fns_keepdims(): + get_queue_or_skip() + + x = dpt.ones((3, 4, 5, 6, 7), dtype="f4") + m = dpt.mean(x, axis=(1, 2, -1), keepdims=True) + + assert isinstance(m, dpt.usm_ndarray) + assert m.shape == (3, 1, 1, 6, 1) + assert dpt.allclose(m, dpt.asarray(1, dtype=m.dtype)) + + s = dpt.var(x, axis=(1, 2, -1), keepdims=True) + assert isinstance(s, dpt.usm_ndarray) + assert s.shape == (3, 1, 1, 6, 1) + assert dpt.allclose(s, dpt.asarray(0, dtype=s.dtype)) + + +def test_stat_fns_empty_axis(): + 
get_queue_or_skip() + + x = dpt.reshape(dpt.arange(3 * 4 * 5, dtype="f4"), (3, 4, 5)) + m = dpt.mean(x, axis=()) + + assert x.shape == m.shape + assert dpt.all(x == m) + + s = dpt.var(x, axis=()) + assert x.shape == s.shape + assert dpt.all(s == 0) + + d = dpt.std(x, axis=()) + assert x.shape == d.shape + assert dpt.all(d == 0) + + +def test_mean(): + get_queue_or_skip() + + x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3)) + m = dpt.mean(x) + expected = dpt.asarray(4, dtype="f4") + assert dpt.allclose(m, expected) + + m = dpt.mean(x, axis=0) + expected = dpt.arange(3, 6, dtype="f4") + assert dpt.allclose(m, expected) + + m = dpt.mean(x, axis=1) + expected = dpt.asarray([1, 4, 7], dtype="f4") + assert dpt.allclose(m, expected) + + +def test_var_std(): + get_queue_or_skip() + + x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3)) + r = dpt.var(x) + expected = dpt.asarray(6.666666507720947, dtype="f4") + assert dpt.allclose(r, expected) + + r1 = dpt.var(x, correction=3) + expected1 = dpt.asarray(10.0, dtype="f4") + assert dpt.allclose(r1, expected1) + + r = dpt.std(x) + expected = dpt.sqrt(expected) + assert dpt.allclose(r, expected) + + r1 = dpt.std(x, correction=3) + expected1 = dpt.sqrt(expected1) + assert dpt.allclose(r1, expected1) + + r = dpt.var(x, axis=0) + expected = dpt.full(x.shape[1], 6, dtype="f4") + assert dpt.allclose(r, expected) + + r1 = dpt.var(x, axis=0, correction=1) + expected1 = dpt.full(x.shape[1], 9, dtype="f4") + assert dpt.allclose(r1, expected1) + + r = dpt.std(x, axis=0) + expected = dpt.sqrt(expected) + assert dpt.allclose(r, expected) + + r1 = dpt.std(x, axis=0, correction=1) + expected1 = dpt.sqrt(expected1) + assert dpt.allclose(r1, expected1) + + r = dpt.var(x, axis=1) + expected = dpt.full(x.shape[0], 0.6666666865348816, dtype="f4") + assert dpt.allclose(r, expected) + + r1 = dpt.var(x, axis=1, correction=1) + expected1 = dpt.ones(x.shape[0], dtype="f4") + assert dpt.allclose(r1, expected1) + + r = dpt.std(x, axis=1) + expected = dpt.sqrt(expected) + assert dpt.allclose(r, expected) + + r1 = dpt.std(x, axis=1, correction=1) + expected1 = dpt.sqrt(expected1) + assert dpt.allclose(r1, expected1) + + +def test_var_axis_length_correction(): + get_queue_or_skip() + + x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3)) + + r = dpt.var(x, correction=x.size) + assert dpt.isnan(r) + + r = dpt.var(x, axis=0, correction=x.shape[0]) + assert dpt.all(dpt.isnan(r)) + + r = dpt.var(x, axis=1, correction=x.shape[1]) + assert dpt.all(dpt.isnan(r)) + + +def test_stat_function_errors(): + d = dict() + with pytest.raises(TypeError): + dpt.var(d) + with pytest.raises(TypeError): + dpt.std(d) + with pytest.raises(TypeError): + dpt.mean(d) + + x = dpt.empty(1, dtype="f4") + with pytest.raises(TypeError): + dpt.var(x, axis=d) + with pytest.raises(TypeError): + dpt.std(x, axis=d) + with pytest.raises(TypeError): + dpt.mean(x, axis=d) + + with pytest.raises(TypeError): + dpt.var(x, correction=d) + with pytest.raises(TypeError): + dpt.std(x, correction=d) + + x = dpt.empty(1, dtype="c8") + with pytest.raises(ValueError): + dpt.var(x) + with pytest.raises(ValueError): + dpt.std(x)
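For reference, a short usage sketch of the statistical functions introduced by this patch (a sketch based on the tests above; it assumes dpctl is installed and a default SYCL device with float32 support is available):

    import dpctl.tensor as dpt

    x = dpt.reshape(dpt.arange(9, dtype="f4"), (3, 3))
    m = dpt.mean(x)                # 4.0
    v = dpt.var(x)                 # ~6.6666665; divisor is N
    v1 = dpt.var(x, correction=1)  # 7.5; divisor is N - correction
    s = dpt.std(x, axis=0)         # column-wise standard deviations, sqrt(6) each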