Skip to content
This repository has been archived by the owner on Jan 13, 2025. It is now read-only.

Commit

Permalink
Refactor level1 nrm2 operator (#466)
Browse files Browse the repository at this point in the history
Improved nrm2 performance by using a single kernel for reduction
  • Loading branch information
s-Nick authored Oct 16, 2023
1 parent fe44d70 commit dd587dd
Show file tree
Hide file tree
Showing 9 changed files with 139 additions and 11 deletions.
3 changes: 2 additions & 1 deletion benchmark/portblas/blas1/asum.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
auto vr_temp_gpu = blas::helper::allocate<mem_alloc, scalar_t>(1, q);
auto copy_temp =
blas::helper::copy_to_device<scalar_t>(q, &vr_temp, vr_temp_gpu, 1);
auto asum_event = _asum(sb_handle, size, inx, static_cast<index_t>(1), vr_temp_gpu);
auto asum_event = _asum(sb_handle, size, inx, static_cast<index_t>(1),
vr_temp_gpu, {copy_temp});
sb_handle.wait(asum_event);
auto copy_output = blas::helper::copy_to_host(q, vr_temp_gpu, &vr_temp, 1);
sb_handle.wait(copy_output);
Expand Down
3 changes: 2 additions & 1 deletion benchmark/portblas/blas1/nrm2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,9 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size,
scalar_t vr_temp = 0;
{
auto vr_temp_gpu = blas::helper::allocate<mem_alloc, scalar_t>(1, q);
auto copy_init = blas::helper::copy_to_device(q, &vr_temp, vr_temp_gpu, 1);
auto nrm2_event =
_nrm2(sb_handle, size, inx, static_cast<index_t>(1), vr_temp_gpu);
_nrm2(sb_handle, size, inx, static_cast<index_t>(1), vr_temp_gpu, {copy_init});
sb_handle.wait(nrm2_event);
auto copy_output = blas::helper::copy_to_host(q, vr_temp_gpu, &vr_temp, 1);
sb_handle.wait(copy_output);
Expand Down
13 changes: 13 additions & 0 deletions include/interface/blas1_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,18 @@ typename sb_handle_t::event_t _nrm2(
sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _rs, const typename sb_handle_t::event_t &_dependencies);

/*!
* \brief Prototype for the internal implementation of the NRM2 operator. See
* documentation in the blas1_interface.hpp file for details.
*
* \tparam localSize number of threads per work group used by the kernel
* \tparam localMemSize size of local shared memory to use; 0 selects the
*                      execution path that is launched without a local
*                      memory size
* \param sb_handle handle used to execute the operations
* \param _N length used to build the view over the input vector
* \param _vx input vector container
* \param _incx increment used to build the view over the input vector
* \param _rs container wrapped as a one-element view that receives the result
* \param number_WG number of work groups; the reduction is launched with
*                  number_WG * localSize as its global size
* \param _dependencies events passed to the execution of the reduction
*/
template <int localSize, int localMemSize, typename sb_handle_t,
typename container_0_t, typename container_1_t, typename index_t,
typename increment_t>
typename sb_handle_t::event_t _nrm2_impl(
sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _rs, const index_t number_WG,
const typename sb_handle_t::event_t &_dependencies);

/**
* @brief _rot constructor given plane rotation
* @param sb_handle SB_Handle
Expand Down Expand Up @@ -463,6 +475,7 @@ template <typename sb_handle_t, typename container_t, typename index_t,
typename ValueType<container_t>::type _nrm2(
sb_handle_t &sb_handle, index_t _N, container_t _vx, increment_t _incx,
const typename sb_handle_t::event_t &_dependencies);

} // namespace internal

template <typename sb_handle_t, typename container_0_t, typename container_1_t,
Expand Down
21 changes: 21 additions & 0 deletions src/interface/blas1/backend/amd_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,27 @@ typename sb_handle_t::event_t _asum(
} // namespace backend
} // namespace asum

namespace nrm2 {
namespace backend {
/*!
 * @brief AMD GPU dispatch for the nrm2 operator.
 *
 * Picks the work-group size and the number of work groups from the vector
 * length and forwards every argument to blas::internal::_nrm2_impl with a
 * local memory size of 32:
 *  - _N below 2^18: work-group size 1024, ceil(_N / 1024) work groups;
 *  - otherwise: work-group size 512, 512 work groups.
 *
 * @return the event returned by blas::internal::_nrm2_impl
 */
template <typename sb_handle_t, typename container_0_t, typename container_1_t,
          typename index_t, typename increment_t>
typename sb_handle_t::event_t _nrm2(
    sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
    container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) {
  constexpr int local_mem_size = 32;

  // Large inputs use a fixed launch configuration.
  if (_N >= (1 << 18)) {
    constexpr int wg_size = 512;
    constexpr index_t wg_count = 512;
    return blas::internal::_nrm2_impl<wg_size, local_mem_size>(
        sb_handle, _N, _vx, _incx, _rs, wg_count, _dependencies);
  }

  // Smaller inputs get just enough work groups to cover _N (rounded up).
  constexpr index_t wg_size = 1024;
  const index_t wg_count = (_N + wg_size - 1) / wg_size;
  return blas::internal::_nrm2_impl<static_cast<int>(wg_size), local_mem_size>(
      sb_handle, _N, _vx, _incx, _rs, wg_count, _dependencies);
}
}  // namespace backend
}  // namespace nrm2
} // namespace blas

#endif
14 changes: 14 additions & 0 deletions src/interface/blas1/backend/default_cpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,20 @@ typename sb_handle_t::event_t _asum(
} // namespace backend
} // namespace asum

namespace nrm2 {
namespace backend {
/*!
 * @brief Default CPU dispatch for the nrm2 operator.
 *
 * Forwards every argument to blas::internal::_nrm2_impl using a fixed
 * configuration: work-group size 8, 16 work groups and a local memory size
 * of 0.
 *
 * @return the event returned by blas::internal::_nrm2_impl
 */
template <typename sb_handle_t, typename container_0_t, typename container_1_t,
          typename index_t, typename increment_t>
typename sb_handle_t::event_t _nrm2(
    sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
    container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) {
  constexpr int wg_size = 8;
  constexpr int local_mem_size = 0;
  constexpr index_t wg_count = 16;

  auto reduction_event =
      blas::internal::_nrm2_impl<wg_size, local_mem_size>(
          sb_handle, _N, _vx, _incx, _rs, wg_count, _dependencies);
  return reduction_event;
}
}  // namespace backend
}  // namespace nrm2
} // namespace blas

#endif
16 changes: 16 additions & 0 deletions src/interface/blas1/backend/intel_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,22 @@ typename sb_handle_t::event_t _asum(
} // namespace backend
} // namespace asum

namespace nrm2 {
namespace backend {
/*!
 * @brief Intel GPU dispatch for the nrm2 operator.
 *
 * Uses a work-group size of 128 and launches ceil(_N / 128) work groups,
 * capped at 512, then forwards every argument to
 * blas::internal::_nrm2_impl with a local memory size of 32.
 *
 * @return the event returned by blas::internal::_nrm2_impl
 */
template <typename sb_handle_t, typename container_0_t, typename container_1_t,
          typename index_t, typename increment_t>
typename sb_handle_t::event_t _nrm2(
    sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
    container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) {
  constexpr int local_mem_size = 32;
  constexpr index_t wg_size = 128;
  constexpr index_t max_wg_count = 512;

  // Work groups needed to cover _N, rounded up, then clamped to the cap.
  const index_t covering_wg_count = (_N + wg_size - 1) / wg_size;
  const index_t wg_count = std::min(covering_wg_count, max_wg_count);

  return blas::internal::_nrm2_impl<static_cast<int>(wg_size), local_mem_size>(
      sb_handle, _N, _vx, _incx, _rs, wg_count, _dependencies);
}
}  // namespace backend
}  // namespace nrm2

} // namespace blas

#endif
26 changes: 25 additions & 1 deletion src/interface/blas1/backend/nvidia_gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ typename sb_handle_t::event_t _asum(
const index_t number_WG = (_N < (1 << 18))
? (_N + localSize - 1) / localSize
: static_cast<index_t>(256);
return blas::internal::_asum_impl<static_cast<index_t>(localSize), 32>(
return blas::internal::_asum_impl<static_cast<int>(localSize), 32>(
sb_handle, _N, _vx, _incx, _rs, number_WG, _dependencies);
} else {
constexpr int localSize = 512;
Expand All @@ -51,6 +51,30 @@ typename sb_handle_t::event_t _asum(
} // namespace backend
} // namespace asum

namespace nrm2 {
namespace backend {
/*!
 * @brief NVIDIA GPU dispatch for the nrm2 operator.
 *
 * Always uses a work-group size of 512 and a local memory size of 32, and
 * selects the number of work groups from the vector length before
 * forwarding every argument to blas::internal::_nrm2_impl:
 *  - _N below 2^18: ceil(_N / 512) work groups;
 *  - _N below 2^23: 256 work groups;
 *  - otherwise: 1024 work groups.
 *
 * @return the event returned by blas::internal::_nrm2_impl
 */
template <typename sb_handle_t, typename container_0_t, typename container_1_t,
          typename index_t, typename increment_t>
typename sb_handle_t::event_t _nrm2(
    sb_handle_t& sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
    container_1_t _rs, const typename sb_handle_t::event_t& _dependencies) {
  constexpr int local_mem_size = 32;

  // Very large inputs use a fixed launch configuration.
  if (_N >= (1 << 23)) {
    constexpr int wg_size = 512;
    constexpr index_t wg_count = 1024;
    return blas::internal::_nrm2_impl<wg_size, local_mem_size>(
        sb_handle, _N, _vx, _incx, _rs, wg_count, _dependencies);
  }

  constexpr index_t wg_size = 512;
  // Mid-sized inputs use 256 work groups; small ones only as many as are
  // needed to cover _N (rounded up).
  index_t wg_count = static_cast<index_t>(256);
  if (_N < (1 << 18)) {
    wg_count = (_N + wg_size - 1) / wg_size;
  }
  return blas::internal::_nrm2_impl<static_cast<int>(wg_size), local_mem_size>(
      sb_handle, _N, _vx, _incx, _rs, wg_count, _dependencies);
}
}  // namespace backend
}  // namespace nrm2

} // namespace blas

#endif
50 changes: 44 additions & 6 deletions src/interface/blas1_interface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -367,17 +367,51 @@ template <typename sb_handle_t, typename container_0_t, typename container_1_t,
typename sb_handle_t::event_t _nrm2(
sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _rs, const typename sb_handle_t::event_t &_dependencies) {
return blas::nrm2::backend::_nrm2(sb_handle, _N, _vx, _incx, _rs,
_dependencies);
}

/*! _nrm2_impl.
* @brief Internal implementation of the nrm2 operator.
*
* This function contains the code that sets up and executes the kernels
* required to perform the nrm2 operation.
*
* This function is called by blas::nrm2::backend::_nrm2 which, depending on
* the platform being compiled for and other parameters, provides different
* template parameters so that the most suitable kernel is constructed.
*
* @tparam localSize specifies the number of threads per work group used by
* the kernel
* @tparam localMemSize specifies the size of local shared memory to use, which
* is device and implementation dependent. If 0, the
* implementation uses a kernel which doesn't require
* local memory.
*/
template <int localSize, int localMemSize, typename sb_handle_t,
typename container_0_t, typename container_1_t, typename index_t,
typename increment_t>
typename sb_handle_t::event_t _nrm2_impl(
sb_handle_t &sb_handle, index_t _N, container_0_t _vx, increment_t _incx,
container_1_t _rs, const index_t number_WG,
const typename sb_handle_t::event_t &_dependencies) {
typename VectorViewType<container_0_t, index_t, increment_t>::type vx =
make_vector_view(_vx, _incx, _N);
auto rs = make_vector_view(_rs, static_cast<increment_t>(1),
static_cast<index_t>(1));
auto prdOp = make_op<UnaryOp, SquareOperator>(vx);

const auto localSize = sb_handle.get_work_group_size();
const auto nWG = 2 * localSize;
auto assignOp =
make_assign_reduction<AddOperator>(rs, prdOp, localSize, localSize * nWG);
auto ret0 = sb_handle.execute(assignOp, _dependencies);
auto assignOp = make_wg_atomic_reduction<AddOperator>(rs, prdOp);
typename sb_handle_t::event_t ret0;
if constexpr (localMemSize != 0) {
ret0 = sb_handle.execute(assignOp, static_cast<index_t>(localSize),
static_cast<index_t>(number_WG * localSize),
static_cast<index_t>(localMemSize), _dependencies);
} else {
ret0 = sb_handle.execute(assignOp, static_cast<index_t>(localSize),
static_cast<index_t>(number_WG * localSize),
_dependencies);
}
auto sqrtOp = make_op<UnaryOp, SqrtOperator>(rs);
auto assignOpFinal = make_op<Assign>(rs, sqrtOp);
auto ret1 = sb_handle.execute(assignOpFinal, ret0);
Expand Down Expand Up @@ -861,8 +895,12 @@ typename ValueType<container_t>::type _nrm2(
auto gpu_res = blas::helper::allocate < is_usm ? helper::AllocType::usm
: helper::AllocType::buffer,
element_t > (static_cast<index_t>(1), sb_handle.get_queue());
typename sb_handle_t::event_t copy_init_val = {blas::helper::copy_to_device(
sb_handle.get_queue(), res.data(), gpu_res, 1)};
const auto local_deps =
concatenate_vectors(_dependencies, copy_init_val);
auto nrm2_event =
blas::internal::_nrm2(sb_handle, _N, _vx, _incx, gpu_res, _dependencies);
blas::internal::_nrm2(sb_handle, _N, _vx, _incx, gpu_res, local_deps);
sb_handle.wait(nrm2_event);
auto event =
blas::helper::copy_to_host(sb_handle.get_queue(), gpu_res, res.data(), 1);
Expand Down
4 changes: 2 additions & 2 deletions test/unittest/blas1/blas1_nrm2_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ void run_test(const combination_t<scalar_t> combi) {
fill_random(x_v);

// Output scalar
scalar_t out_s = 10.0;
scalar_t out_cpu_s = 20.0;
scalar_t out_s = 0.0;
scalar_t out_cpu_s = 0.0;

// Reference implementation
if (incX < 0) {
Expand Down

0 comments on commit dd587dd

Please sign in to comment.