[BLAS] allow device pointers in cublas output scalars (#304)
andrewtbarker authored May 9, 2023
1 parent 004b2d4 commit 52a4ccd
Showing 10 changed files with 374 additions and 65 deletions.
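With this change, the scalar output arguments of the cuBLAS-backed USM APIs (the result of asum, dot, sdsdot, nrm2, iamax and iamin, and the in/out scalars of rotg and rotmg) may point to device-allocated USM as well as to host-accessible memory. Below is a minimal usage sketch; it is illustrative only and assumes the existing oneapi::mkl::blas::column_major USM entry points, it is not code from this commit.

    #include <cstdint>
    #include <sycl/sycl.hpp>
    #include "oneapi/mkl.hpp"  // assumed umbrella header for the oneMKL interfaces

    int main() {
        sycl::queue q{ sycl::gpu_selector_v };
        const std::int64_t n = 1024;

        float *x = sycl::malloc_device<float>(n, q);
        float *result = sycl::malloc_device<float>(1, q);  // device-only output scalar
        q.fill(x, 1.0f, n).wait();

        // The cuBLAS backend now detects that `result` is device USM and switches
        // the handle to CUBLAS_POINTER_MODE_DEVICE for the call.
        oneapi::mkl::blas::column_major::asum(q, n, x, 1, result).wait();

        float host_result = 0.0f;
        q.memcpy(&host_result, result, sizeof(float)).wait();

        sycl::free(result, q);
        sycl::free(x, q);
        return 0;
    }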
186 changes: 171 additions & 15 deletions src/blas/backends/cublas/cublas_level1.cpp
@@ -624,6 +624,8 @@ inline sycl::event asum(const char *func_name, Func func, sycl::queue &queue, in
using cuDataType2 = typename CudaEquivalentType<T2>::Type;
overflow_check(n, incx);

bool result_on_device =
sycl::get_pointer_type(result, queue.get_context()) == sycl::usm::alloc::device;
auto done = queue.submit([&](sycl::handler &cgh) {
int64_t num_events = dependencies.size();
for (int64_t i = 0; i < num_events; i++) {
@@ -633,9 +635,15 @@ inline sycl::event asum(const char *func_name, Func func, sycl::queue &queue, in
auto handle = sc.get_handle(queue);
auto x_ = reinterpret_cast<const cuDataType1 *>(x);
auto res_ = reinterpret_cast<cuDataType2 *>(result);
if (result_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
}
cublasStatus_t err;
// ASUM does not support negative index
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, std::abs(incx), res_);
if (result_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
}
});
});
return done;
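The hunks in this file all follow the same pattern: detect whether the output scalar is device USM, switch the cuBLAS handle into device pointer mode for the call, and restore host pointer mode afterwards. For reference, a standalone sketch of that pattern in plain CUDA/cuBLAS terms (illustrative, not part of this commit):

    #include <cublas_v2.h>

    // d_x and d_result are device pointers. Without the pointer-mode switch,
    // cuBLAS would treat d_result as a host address and the write would fault
    // or corrupt memory.
    void asum_to_device_scalar(cublasHandle_t handle, int n, const float *d_x, float *d_result) {
        cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);  // result lives on the device
        cublasSasum(handle, n, d_x, 1, d_result);                  // writes the sum into d_result
        cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);    // restore the default host mode
    }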
@@ -752,6 +760,22 @@ inline sycl::event rotg(const char *func_name, Func func, sycl::queue &queue, T1
T1 *s, const std::vector<sycl::event> &dependencies) {
using cuDataType1 = typename CudaEquivalentType<T1>::Type;
using cuDataType2 = typename CudaEquivalentType<T2>::Type;
auto ctx = queue.get_context();
bool results_on_device =
(sycl::get_pointer_type(a, ctx) == sycl::usm::alloc::device ||
sycl::get_pointer_type(b, ctx) == sycl::usm::alloc::device ||
sycl::get_pointer_type(c, ctx) == sycl::usm::alloc::device ||
sycl::get_pointer_type(s, ctx) == sycl::usm::alloc::device);
if (results_on_device) {
if (sycl::get_pointer_type(a, ctx) == sycl::usm::alloc::unknown ||
sycl::get_pointer_type(b, ctx) == sycl::usm::alloc::unknown ||
sycl::get_pointer_type(c, ctx) == sycl::usm::alloc::unknown ||
sycl::get_pointer_type(s, ctx) == sycl::usm::alloc::unknown) {
throw oneapi::mkl::exception(
"blas", "rotg",
"If any pointer is only device accessible, all must be device accessible");
}
}
auto done = queue.submit([&](sycl::handler &cgh) {
int64_t num_events = dependencies.size();
for (int64_t i = 0; i < num_events; i++) {
@@ -763,8 +787,14 @@ inline sycl::event rotg(const char *func_name, Func func, sycl::queue &queue, T1
auto b_ = reinterpret_cast<cuDataType1 *>(b);
auto c_ = reinterpret_cast<cuDataType2 *>(c);
auto s_ = reinterpret_cast<cuDataType1 *>(s);
if (results_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
}
cublasStatus_t err;
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, a_, b_, c_, s_);
if (results_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
}
});
});
return done;
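A usage sketch for the new rotg check (illustrative only, assuming the oneapi::mkl::blas::column_major::rotg USM entry point): the four in/out scalars must either all be host accessible or, as here, all be device-accessible USM, so the backend can run the call in device pointer mode.

    #include <sycl/sycl.hpp>
    #include "oneapi/mkl.hpp"  // assumed include path

    void rotg_on_device(sycl::queue &q) {
        float *a = sycl::malloc_device<float>(1, q);
        float *b = sycl::malloc_device<float>(1, q);
        float *c = sycl::malloc_device<float>(1, q);
        float *s = sycl::malloc_device<float>(1, q);
        const float init[2] = { 3.0f, 4.0f };
        q.memcpy(a, &init[0], sizeof(float)).wait();
        q.memcpy(b, &init[1], sizeof(float)).wait();

        // All four pointers are device USM, so the check above passes and the
        // call runs with CUBLAS_POINTER_MODE_DEVICE.
        oneapi::mkl::blas::column_major::rotg(q, a, b, c, s).wait();

        sycl::free(a, q);
        sycl::free(b, q);
        sycl::free(c, q);
        sycl::free(s, q);
    }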
@@ -856,6 +886,8 @@ inline sycl::event dot(const char *func_name, Func func, sycl::queue &queue, int
const std::vector<sycl::event> &dependencies) {
using cuDataType = typename CudaEquivalentType<T>::Type;
overflow_check(n, incx, incy);
bool result_on_device =
sycl::get_pointer_type(result, queue.get_context()) == sycl::usm::alloc::device;
auto done = queue.submit([&](sycl::handler &cgh) {
int64_t num_events = dependencies.size();
for (int64_t i = 0; i < num_events; i++) {
@@ -866,8 +898,14 @@ inline sycl::event dot(const char *func_name, Func func, sycl::queue &queue, int
auto x_ = reinterpret_cast<const cuDataType *>(x);
auto y_ = reinterpret_cast<const cuDataType *>(y);
auto res_ = reinterpret_cast<cuDataType *>(result);
if (result_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
}
cublasStatus_t err;
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, y_, incy, res_);
if (result_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
}
});
});
return done;
@@ -931,7 +969,9 @@ sycl::event sdsdot(sycl::queue &queue, int64_t n, float sb, const float *x, int6
const float *y, int64_t incy, float *result,
const std::vector<sycl::event> &dependencies) {
overflow_check(n, incx, incy);
// cuBLAS does not support sdot so we need to mimic sdot.
bool result_on_device =
sycl::get_pointer_type(result, queue.get_context()) == sycl::usm::alloc::device;
// cuBLAS does not support sdsdot so we need to mimic sdot.
auto done = queue.submit([&](sycl::handler &cgh) {
int64_t num_events = dependencies.size();
for (int64_t i = 0; i < num_events; i++) {
@@ -942,13 +982,32 @@ sycl::event sdsdot(sycl::queue &queue, int64_t n, float sb, const float *x, int6
auto x_ = reinterpret_cast<const float *>(x);
auto y_ = reinterpret_cast<const float *>(y);
auto res_ = reinterpret_cast<float *>(result);
if (result_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
}
cublasStatus_t err;
CUBLAS_ERROR_FUNC_SYNC(cublasSdot, err, handle, n, x_, incx, y_, incy, res_);
if (result_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
}
});
});
done.wait();
result[0] = result[0] + sb;
return done;
if (result_on_device) {
// The following copies device to host and then host to device just to
// add the sb constant. This is pretty inefficient and should maybe be
// replaced with a SYCL GPU kernel, but it duplicates what is done in
// the buffer API.
float host_result;
queue.memcpy(&host_result, result, sizeof(float)).wait();
host_result += sb;
auto last_ev = queue.memcpy(result, &host_result, sizeof(float));
// Wait so that host_result stays alive until the copy has completed.
last_ev.wait();
return last_ev;
}
else {
result[0] = result[0] + sb;
return done;
}
}

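The comment above suggests a device-side alternative to the host round-trip for adding sb. A sketch of what such a SYCL kernel could look like (illustrative only, not what this commit does):

    #include <sycl/sycl.hpp>

    // Adds sb to the dot result in place on the device, depending on the cuBLAS
    // event, so no host copy is needed.
    sycl::event add_sb_on_device(sycl::queue &queue, float *result, float sb, sycl::event dot_done) {
        return queue.submit([&](sycl::handler &cgh) {
            cgh.depends_on(dot_done);
            cgh.single_task([=]() { *result += sb; });
        });
    }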
sycl::event dot(sycl::queue &queue, int64_t n, const float *x, int64_t incx, const float *y,
@@ -960,6 +1019,25 @@ template <typename Func, typename T>
inline sycl::event rotmg(const char *func_name, Func func, sycl::queue &queue, T *d1, T *d2, T *x1,
T y1, T *param, const std::vector<sycl::event> &dependencies) {
using cuDataType = typename CudaEquivalentType<T>::Type;
auto ctx = queue.get_context();
bool results_on_device =
(sycl::get_pointer_type(d1, ctx) == sycl::usm::alloc::device ||
sycl::get_pointer_type(d2, ctx) == sycl::usm::alloc::device ||
sycl::get_pointer_type(x1, ctx) == sycl::usm::alloc::device);
if (results_on_device) {
if (sycl::get_pointer_type(d1, ctx) == sycl::usm::alloc::unknown ||
sycl::get_pointer_type(d2, ctx) == sycl::usm::alloc::unknown ||
sycl::get_pointer_type(x1, ctx) == sycl::usm::alloc::unknown) {
throw oneapi::mkl::exception(
"blas", "rotmg",
"If any pointer is only device accessible, all must be device accessible");
}
}
cuDataType *y1_;
if (results_on_device) {
y1_ = sycl::malloc_device<cuDataType>(1, queue);
queue.memcpy(y1_, &y1, sizeof(cuDataType)).wait();
}
auto done = queue.submit([&](sycl::handler &cgh) {
int64_t num_events = dependencies.size();
for (int64_t i = 0; i < num_events; i++) {
@@ -970,12 +1048,24 @@ inline sycl::event rotmg(const char *func_name, Func func, sycl::queue &queue, T
auto d1_ = reinterpret_cast<cuDataType *>(d1);
auto d2_ = reinterpret_cast<cuDataType *>(d2);
auto x1_ = reinterpret_cast<cuDataType *>(x1);
auto y1_ = reinterpret_cast<const cuDataType *>(&y1);
auto param_ = reinterpret_cast<cuDataType *>(param);
cublasStatus_t err;
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, d1_, d2_, x1_, y1_, param_);
if (results_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, d1_, d2_, x1_, y1_, param_);
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
}
else {
auto y1_c = reinterpret_cast<const cuDataType *>(&y1);
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, d1_, d2_, x1_, y1_c, param_);
}
});
});
if (results_on_device) {
done.wait();
queue.memcpy(&y1, y1_, sizeof(cuDataType)).wait();
sycl::free(y1_, queue);
}
return done;
}

@@ -1001,7 +1091,15 @@ inline sycl::event iamax(const char *func_name, Func func, sycl::queue &queue, i
// This change may cause failure as the result of integer overflow
// based on the size.
int int_res = 0;
int *int_res_p = &int_res;
int *int_res_p = nullptr;
bool result_on_device =
sycl::get_pointer_type(result, queue.get_context()) == sycl::usm::alloc::device;
if (result_on_device) {
int_res_p = sycl::malloc_device<int>(1, queue);
}
else {
int_res_p = &int_res;
}
auto done = queue.submit([&](sycl::handler &cgh) {
int64_t num_events = dependencies.size();
for (int64_t i = 0; i < num_events; i++) {
@@ -1010,16 +1108,37 @@ inline sycl::event iamax(const char *func_name, Func func, sycl::queue &queue, i
onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
auto handle = sc.get_handle(queue);
auto x_ = reinterpret_cast<const cuDataType *>(x);
auto int_res_p_ = reinterpret_cast<int *>(int_res_p);
if (result_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
}
cublasStatus_t err;
// For negative incx, iamax returns 0. This behaviour is similar to that of
// reference iamax.
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, int_res_p_);
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, int_res_p);
if (result_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
}
});
});
done.wait();
result[0] = std::max((int64_t)(*int_res_p - 1), int64_t{ 0 });
return done;
if (result_on_device) {
// The following copies device to host and then host to device just to
// adjust to 0-based indexing. This is pretty inefficient and should maybe
// be replaced with a SYCL GPU kernel, but it duplicates what is done in
// the buffer API.
int host_int;
int64_t host_int64;
queue.memcpy(&host_int, int_res_p, sizeof(int)).wait();
host_int64 = std::max((int64_t)host_int - 1, int64_t{ 0 });
auto last_ev = queue.memcpy(result, &host_int64, sizeof(int64_t));
last_ev.wait();
sycl::free(int_res_p, queue);
return last_ev;
}
else {
result[0] = std::max((int64_t)(*int_res_p - 1), int64_t{ 0 });
return done;
}
}

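Likewise, the device-to-host-to-device adjustment in iamax (and in iamin below) could be done on the GPU; a sketch of such a kernel, illustrative only and not part of this commit:

    #include <cstdint>
    #include <sycl/sycl.hpp>

    // Converts cuBLAS's 1-based int index into the 0-based int64_t result in
    // place on the device. The temporary int_res_p would still have to be
    // freed once this event completes.
    sycl::event adjust_index_on_device(sycl::queue &queue, const int *int_res_p,
                                       std::int64_t *result, sycl::event iamax_done) {
        return queue.submit([&](sycl::handler &cgh) {
            cgh.depends_on(iamax_done);
            cgh.single_task([=]() {
                const int idx = *int_res_p;
                *result = idx > 0 ? static_cast<std::int64_t>(idx) - 1 : 0;
            });
        });
    }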
#define IAMAX_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \
@@ -1079,7 +1198,15 @@ inline sycl::event iamin(const char *func_name, Func func, sycl::queue &queue, i
// This change may cause failure as the result of integer overflow
// based on the size.
int int_res = 0;
int *int_res_p = &int_res;
int *int_res_p = nullptr;
bool result_on_device =
sycl::get_pointer_type(result, queue.get_context()) == sycl::usm::alloc::device;
if (result_on_device) {
int_res_p = sycl::malloc_device<int>(1, queue);
}
else {
int_res_p = &int_res;
}
auto done = queue.submit([&](sycl::handler &cgh) {
int64_t num_events = dependencies.size();
for (int64_t i = 0; i < num_events; i++) {
@@ -1088,16 +1215,37 @@ inline sycl::event iamin(const char *func_name, Func func, sycl::queue &queue, i
onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
auto handle = sc.get_handle(queue);
auto x_ = reinterpret_cast<const cuDataType *>(x);
auto int_res_p_ = reinterpret_cast<int *>(int_res_p);
if (result_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
}
cublasStatus_t err;
// For negative incx, iamin returns 0. This behaviour is similar to that of
// implemented iamin.
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, int_res_p_);
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, int_res_p);
if (result_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
}
});
});
done.wait();
result[0] = std::max((int64_t)(*int_res_p - 1), int64_t{ 0 });
return done;
if (result_on_device) {
// The following copies device to host and then host to device just to
// adjust to 0-based indexing. This is pretty inefficient and should maybe
// be replaced with a SYCL GPU kernel, but it duplicates what is done in
// the buffer API.
int host_int;
int64_t host_int64;
queue.memcpy(&host_int, int_res_p, sizeof(int)).wait();
host_int64 = std::max((int64_t)host_int - 1, int64_t{ 0 });
auto last_ev = queue.memcpy(result, &host_int64, sizeof(int64_t));
last_ev.wait();
sycl::free(int_res_p, queue);
return last_ev;
}
else {
result[0] = std::max((int64_t)(*int_res_p - 1), int64_t{ 0 });
return done;
}
}

#define IAMIN_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \
Expand All @@ -1119,6 +1267,8 @@ inline sycl::event nrm2(const char *func_name, Func func, sycl::queue &queue, in
using cuDataType2 = typename CudaEquivalentType<T2>::Type;
overflow_check(n, incx);

bool result_on_device =
sycl::get_pointer_type(result, queue.get_context()) == sycl::usm::alloc::device;
auto done = queue.submit([&](sycl::handler &cgh) {
int64_t num_events = dependencies.size();
for (int64_t i = 0; i < num_events; i++) {
@@ -1128,9 +1278,15 @@ inline sycl::event nrm2(const char *func_name, Func func, sycl::queue &queue, in
auto handle = sc.get_handle(queue);
auto x_ = reinterpret_cast<const cuDataType1 *>(x);
auto res_ = reinterpret_cast<cuDataType2 *>(result);
if (result_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
}
cublasStatus_t err;
// NRM2 does not support negative index
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, std::abs(incx), res_);
if (result_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
}
});
});
return done;
14 changes: 14 additions & 0 deletions tests/unit_tests/blas/include/test_common.hpp
@@ -460,6 +460,13 @@ typename std::enable_if<std::is_integral<fp>::value, bool>::type check_equal(fp
return (x == x_ref);
}

template <typename fp>
bool check_equal_ptr(sycl::queue queue, fp *x, fp x_ref, int error_mag) {
fp x_host;
queue.memcpy(&x_host, x, sizeof(fp)).wait();
return check_equal(x_host, x_ref, error_mag);
}

template <typename fp>
bool check_equal_trsm(fp x, fp x_ref, int error_mag) {
using fp_real = typename complex_info<fp>::real_type;
Expand Down Expand Up @@ -487,6 +494,13 @@ bool check_equal(fp x, fp x_ref, int error_mag, std::ostream &out) {
return good;
}

template <typename fp>
bool check_equal_ptr(sycl::queue queue, fp *x, fp x_ref, int error_mag, std::ostream &out) {
fp x_host;
queue.memcpy(&x_host, x, sizeof(fp)).wait();
return check_equal(x_host, x_ref, error_mag, out);
}

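A hypothetical test fragment (not part of this commit) showing how the new helper is meant to be used when the output scalar lives in device USM:

    template <typename fp>
    bool verify_device_scalar(sycl::queue queue, fp *device_result, fp reference, std::ostream &out) {
        // check_equal_ptr copies the device scalar back to the host and then
        // forwards to check_equal; 4 is an illustrative error magnitude.
        return check_equal_ptr(queue, device_result, reference, 4, out);
    }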
template <typename fp>
bool check_equal_vector(const fp *v, const fp *v_ref, int n, int inc, int error_mag,
std::ostream &out) {