[BLAS] allow device pointers in cublas output scalars (#304)
andrewtbarker authored May 9, 2023
1 parent 004b2d4 commit 52a4ccd
Showing 10 changed files with 374 additions and 65 deletions.
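With this change, the scalar output arguments of the cuBLAS-backed USM APIs (the result of asum, dot, sdsdot, nrm2, iamax and iamin, and the in/out scalars of rotg and rotmg) may point to device-allocated USM as well as to host-accessible memory. Below is a minimal usage sketch; it is illustrative only and assumes the existing oneapi::mkl::blas::column_major USM entry points, it is not code from this commit.

    #include <cstdint>
    #include <sycl/sycl.hpp>
    #include "oneapi/mkl.hpp"  // assumed umbrella header for the oneMKL interfaces

    int main() {
        sycl::queue q{ sycl::gpu_selector_v };
        const std::int64_t n = 1024;

        float *x = sycl::malloc_device<float>(n, q);
        float *result = sycl::malloc_device<float>(1, q);  // device-only output scalar
        q.fill(x, 1.0f, n).wait();

        // The cuBLAS backend now detects that `result` is device USM and switches
        // the handle to CUBLAS_POINTER_MODE_DEVICE for the call.
        oneapi::mkl::blas::column_major::asum(q, n, x, 1, result).wait();

        float host_result = 0.0f;
        q.memcpy(&host_result, result, sizeof(float)).wait();

        sycl::free(result, q);
        sycl::free(x, q);
        return 0;
    }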
186 changes: 171 additions & 15 deletions src/blas/backends/cublas/cublas_level1.cpp
@@ -624,6 +624,8 @@ inline sycl::event asum(const char *func_name, Func func, sycl::queue &queue, in
using cuDataType2 = typename CudaEquivalentType<T2>::Type;
overflow_check(n, incx);

bool result_on_device =
sycl::get_pointer_type(result, queue.get_context()) == sycl::usm::alloc::device;
auto done = queue.submit([&](sycl::handler &cgh) {
int64_t num_events = dependencies.size();
for (int64_t i = 0; i < num_events; i++) {
@@ -633,9 +635,15 @@ inline sycl::event asum(const char *func_name, Func func, sycl::queue &queue, in
auto handle = sc.get_handle(queue);
auto x_ = reinterpret_cast<const cuDataType1 *>(x);
auto res_ = reinterpret_cast<cuDataType2 *>(result);
if (result_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
}
cublasStatus_t err;
// ASUM does not support negative index
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, std::abs(incx), res_);
if (result_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
}
});
});
return done;
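The hunks in this file all follow the same pattern: detect whether the output scalar is device USM, switch the cuBLAS handle into device pointer mode for the call, and restore host pointer mode afterwards. For reference, a standalone sketch of that pattern in plain CUDA/cuBLAS terms (illustrative, not part of this commit):

    #include <cublas_v2.h>

    // d_x and d_result are device pointers. Without the pointer-mode switch,
    // cuBLAS would treat d_result as a host address and the write would fault
    // or corrupt memory.
    void asum_to_device_scalar(cublasHandle_t handle, int n, const float *d_x, float *d_result) {
        cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);  // result lives on the device
        cublasSasum(handle, n, d_x, 1, d_result);                  // writes the sum into d_result
        cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);    // restore the default host mode
    }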
@@ -752,6 +760,22 @@ inline sycl::event rotg(const char *func_name, Func func, sycl::queue &queue, T1
T1 *s, const std::vector<sycl::event> &dependencies) {
using cuDataType1 = typename CudaEquivalentType<T1>::Type;
using cuDataType2 = typename CudaEquivalentType<T2>::Type;
auto ctx = queue.get_context();
bool results_on_device =
(sycl::get_pointer_type(a, ctx) == sycl::usm::alloc::device ||
sycl::get_pointer_type(b, ctx) == sycl::usm::alloc::device ||
sycl::get_pointer_type(c, ctx) == sycl::usm::alloc::device ||
sycl::get_pointer_type(s, ctx) == sycl::usm::alloc::device);
if (results_on_device) {
if (sycl::get_pointer_type(a, ctx) == sycl::usm::alloc::unknown ||
sycl::get_pointer_type(b, ctx) == sycl::usm::alloc::unknown ||
sycl::get_pointer_type(c, ctx) == sycl::usm::alloc::unknown ||
sycl::get_pointer_type(s, ctx) == sycl::usm::alloc::unknown) {
throw oneapi::mkl::exception(
"blas", "rotg",
"If any pointer is only device accessible, all must be device accessible");
}
}
auto done = queue.submit([&](sycl::handler &cgh) {
int64_t num_events = dependencies.size();
for (int64_t i = 0; i < num_events; i++) {
@@ -763,8 +787,14 @@ inline sycl::event rotg(const char *func_name, Func func, sycl::queue &queue, T1
auto b_ = reinterpret_cast<cuDataType1 *>(b);
auto c_ = reinterpret_cast<cuDataType2 *>(c);
auto s_ = reinterpret_cast<cuDataType1 *>(s);
if (results_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
}
cublasStatus_t err;
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, a_, b_, c_, s_);
if (results_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
}
});
});
return done;
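A usage sketch for the new rotg check (illustrative only, assuming the oneapi::mkl::blas::column_major::rotg USM entry point): the four in/out scalars must either all be host accessible or, as here, all be device-accessible USM, so the backend can run the call in device pointer mode.

    #include <sycl/sycl.hpp>
    #include "oneapi/mkl.hpp"  // assumed include path

    void rotg_on_device(sycl::queue &q) {
        float *a = sycl::malloc_device<float>(1, q);
        float *b = sycl::malloc_device<float>(1, q);
        float *c = sycl::malloc_device<float>(1, q);
        float *s = sycl::malloc_device<float>(1, q);
        const float init[2] = { 3.0f, 4.0f };
        q.memcpy(a, &init[0], sizeof(float)).wait();
        q.memcpy(b, &init[1], sizeof(float)).wait();

        // All four pointers are device USM, so the check above passes and the
        // call runs with CUBLAS_POINTER_MODE_DEVICE.
        oneapi::mkl::blas::column_major::rotg(q, a, b, c, s).wait();

        sycl::free(a, q);
        sycl::free(b, q);
        sycl::free(c, q);
        sycl::free(s, q);
    }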
@@ -856,6 +886,8 @@ inline sycl::event dot(const char *func_name, Func func, sycl::queue &queue, int
const std::vector<sycl::event> &dependencies) {
using cuDataType = typename CudaEquivalentType<T>::Type;
overflow_check(n, incx, incy);
bool result_on_device =
sycl::get_pointer_type(result, queue.get_context()) == sycl::usm::alloc::device;
auto done = queue.submit([&](sycl::handler &cgh) {
int64_t num_events = dependencies.size();
for (int64_t i = 0; i < num_events; i++) {
@@ -866,8 +898,14 @@ inline sycl::event dot(const char *func_name, Func func, sycl::queue &queue, int
auto x_ = reinterpret_cast<const cuDataType *>(x);
auto y_ = reinterpret_cast<const cuDataType *>(y);
auto res_ = reinterpret_cast<cuDataType *>(result);
if (result_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
}
cublasStatus_t err;
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, y_, incy, res_);
if (result_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
}
});
});
return done;
@@ -931,7 +969,9 @@ sycl::event sdsdot(sycl::queue &queue, int64_t n, float sb, const float *x, int6
const float *y, int64_t incy, float *result,
const std::vector<sycl::event> &dependencies) {
overflow_check(n, incx, incy);
// cuBLAS does not support sdot so we need to mimic sdot.
bool result_on_device =
sycl::get_pointer_type(result, queue.get_context()) == sycl::usm::alloc::device;
// cuBLAS does not support sdsdot so we need to mimic sdot.
auto done = queue.submit([&](sycl::handler &cgh) {
int64_t num_events = dependencies.size();
for (int64_t i = 0; i < num_events; i++) {
@@ -942,13 +982,32 @@ sycl::event sdsdot(sycl::queue &queue, int64_t n, float sb, const float *x, int6
auto x_ = reinterpret_cast<const float *>(x);
auto y_ = reinterpret_cast<const float *>(y);
auto res_ = reinterpret_cast<float *>(result);
if (result_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
}
cublasStatus_t err;
CUBLAS_ERROR_FUNC_SYNC(cublasSdot, err, handle, n, x_, incx, y_, incy, res_);
if (result_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
}
});
});
done.wait();
result[0] = result[0] + sb;
return done;
if (result_on_device) {
// The following copies device to host and then host to device just to
// add the sb constant. This is pretty inefficient and should maybe be
// replaced with a SYCL GPU kernel, but it duplicates what is done in
// the buffer API.
float host_result;
queue.memcpy(&host_result, result, sizeof(float)).wait();
host_result += sb;
auto last_ev = queue.memcpy(result, &host_result, sizeof(float));
// Wait so that host_result stays alive until the copy has completed.
last_ev.wait();
return last_ev;
}
else {
result[0] = result[0] + sb;
return done;
}
}

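The comment above suggests a device-side alternative to the host round-trip for adding sb. A sketch of what such a SYCL kernel could look like (illustrative only, not what this commit does):

    #include <sycl/sycl.hpp>

    // Adds sb to the dot result in place on the device, depending on the cuBLAS
    // event, so no host copy is needed.
    sycl::event add_sb_on_device(sycl::queue &queue, float *result, float sb, sycl::event dot_done) {
        return queue.submit([&](sycl::handler &cgh) {
            cgh.depends_on(dot_done);
            cgh.single_task([=]() { *result += sb; });
        });
    }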
sycl::event dot(sycl::queue &queue, int64_t n, const float *x, int64_t incx, const float *y,
@@ -960,6 +1019,25 @@ template <typename Func, typename T>
inline sycl::event rotmg(const char *func_name, Func func, sycl::queue &queue, T *d1, T *d2, T *x1,
T y1, T *param, const std::vector<sycl::event> &dependencies) {
using cuDataType = typename CudaEquivalentType<T>::Type;
auto ctx = queue.get_context();
bool results_on_device =
(sycl::get_pointer_type(d1, ctx) == sycl::usm::alloc::device ||
sycl::get_pointer_type(d2, ctx) == sycl::usm::alloc::device ||
sycl::get_pointer_type(x1, ctx) == sycl::usm::alloc::device);
if (results_on_device) {
if (sycl::get_pointer_type(d1, ctx) == sycl::usm::alloc::unknown ||
sycl::get_pointer_type(d2, ctx) == sycl::usm::alloc::unknown ||
sycl::get_pointer_type(x1, ctx) == sycl::usm::alloc::unknown) {
throw oneapi::mkl::exception(
"blas", "rotmg",
"If any pointer is only device accessible, all must be device accessible");
}
}
cuDataType *y1_;
if (results_on_device) {
y1_ = sycl::malloc_device<cuDataType>(1, queue);
queue.memcpy(y1_, &y1, sizeof(cuDataType)).wait();
}
auto done = queue.submit([&](sycl::handler &cgh) {
int64_t num_events = dependencies.size();
for (int64_t i = 0; i < num_events; i++) {
@@ -970,12 +1048,24 @@ inline sycl::event rotmg(const char *func_name, Func func, sycl::queue &queue, T
auto d1_ = reinterpret_cast<cuDataType *>(d1);
auto d2_ = reinterpret_cast<cuDataType *>(d2);
auto x1_ = reinterpret_cast<cuDataType *>(x1);
auto y1_ = reinterpret_cast<const cuDataType *>(&y1);
auto param_ = reinterpret_cast<cuDataType *>(param);
cublasStatus_t err;
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, d1_, d2_, x1_, y1_, param_);
if (results_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, d1_, d2_, x1_, y1_, param_);
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
}
else {
auto y1_c = reinterpret_cast<const cuDataType *>(&y1);
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, d1_, d2_, x1_, y1_c, param_);
}
});
});
if (results_on_device) {
done.wait();
queue.memcpy(&y1, y1_, sizeof(cuDataType)).wait();
sycl::free(y1_, queue);
}
return done;
}

@@ -1001,7 +1091,15 @@ inline sycl::event iamax(const char *func_name, Func func, sycl::queue &queue, i
// This change may cause failure as the result of integer overflow
// based on the size.
int int_res = 0;
int *int_res_p = &int_res;
int *int_res_p = nullptr;
bool result_on_device =
sycl::get_pointer_type(result, queue.get_context()) == sycl::usm::alloc::device;
if (result_on_device) {
int_res_p = sycl::malloc_device<int>(1, queue);
}
else {
int_res_p = &int_res;
}
auto done = queue.submit([&](sycl::handler &cgh) {
int64_t num_events = dependencies.size();
for (int64_t i = 0; i < num_events; i++) {
@@ -1010,16 +1108,37 @@ inline sycl::event iamax(const char *func_name, Func func, sycl::queue &queue, i
onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
auto handle = sc.get_handle(queue);
auto x_ = reinterpret_cast<const cuDataType *>(x);
auto int_res_p_ = reinterpret_cast<int *>(int_res_p);
if (result_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
}
cublasStatus_t err;
// For negative incx, iamax returns 0. This behaviour is similar to that of
// reference iamax.
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, int_res_p_);
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, int_res_p);
if (result_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
}
});
});
done.wait();
result[0] = std::max((int64_t)(*int_res_p - 1), int64_t{ 0 });
return done;
if (result_on_device) {
// The following copies device to host and then host to device just to
// adjust to 0-based indexing. This is pretty inefficient and should maybe
// be replaced with a SYCL GPU kernel, but it duplicates what is done in
// the buffer API.
int host_int;
int64_t host_int64;
queue.memcpy(&host_int, int_res_p, sizeof(int)).wait();
host_int64 = std::max((int64_t)host_int - 1, int64_t{ 0 });
auto last_ev = queue.memcpy(result, &host_int64, sizeof(int64_t));
last_ev.wait();
sycl::free(int_res_p, queue);
return last_ev;
}
else {
result[0] = std::max((int64_t)(*int_res_p - 1), int64_t{ 0 });
return done;
}
}

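Likewise, the device-to-host-to-device adjustment in iamax (and in iamin below) could be done on the GPU; a sketch of such a kernel, illustrative only and not part of this commit:

    #include <cstdint>
    #include <sycl/sycl.hpp>

    // Converts cuBLAS's 1-based int index into the 0-based int64_t result in
    // place on the device. The temporary int_res_p would still have to be
    // freed once this event completes.
    sycl::event adjust_index_on_device(sycl::queue &queue, const int *int_res_p,
                                       std::int64_t *result, sycl::event iamax_done) {
        return queue.submit([&](sycl::handler &cgh) {
            cgh.depends_on(iamax_done);
            cgh.single_task([=]() {
                const int idx = *int_res_p;
                *result = idx > 0 ? static_cast<std::int64_t>(idx) - 1 : 0;
            });
        });
    }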
#define IAMAX_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \
@@ -1079,7 +1198,15 @@ inline sycl::event iamin(const char *func_name, Func func, sycl::queue &queue, i
// This change may cause failure as the result of integer overflow
// based on the size.
int int_res = 0;
int *int_res_p = &int_res;
int *int_res_p = nullptr;
bool result_on_device =
sycl::get_pointer_type(result, queue.get_context()) == sycl::usm::alloc::device;
if (result_on_device) {
int_res_p = sycl::malloc_device<int>(1, queue);
}
else {
int_res_p = &int_res;
}
auto done = queue.submit([&](sycl::handler &cgh) {
int64_t num_events = dependencies.size();
for (int64_t i = 0; i < num_events; i++) {
@@ -1088,16 +1215,37 @@ inline sycl::event iamin(const char *func_name, Func func, sycl::queue &queue, i
onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) {
auto handle = sc.get_handle(queue);
auto x_ = reinterpret_cast<const cuDataType *>(x);
auto int_res_p_ = reinterpret_cast<int *>(int_res_p);
if (result_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
}
cublasStatus_t err;
// For negative incx, iamin returns 0. This behaviour is similar to that of
// implemented iamin.
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, int_res_p_);
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, int_res_p);
if (result_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
}
});
});
done.wait();
result[0] = std::max((int64_t)(*int_res_p - 1), int64_t{ 0 });
return done;
if (result_on_device) {
// The following copies device to host and then host to device just to
// adjust to 0-based indexing. This is pretty inefficient and should maybe
// be replaced with a SYCL GPU kernel, but it duplicates what is done in
// the buffer API.
int host_int;
int64_t host_int64;
queue.memcpy(&host_int, int_res_p, sizeof(int)).wait();
host_int64 = std::max((int64_t)host_int - 1, int64_t{ 0 });
auto last_ev = queue.memcpy(result, &host_int64, sizeof(int64_t));
last_ev.wait();
sycl::free(int_res_p, queue);
return last_ev;
}
else {
result[0] = std::max((int64_t)(*int_res_p - 1), int64_t{ 0 });
return done;
}
}

#define IAMIN_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \
Expand All @@ -1119,6 +1267,8 @@ inline sycl::event nrm2(const char *func_name, Func func, sycl::queue &queue, in
using cuDataType2 = typename CudaEquivalentType<T2>::Type;
overflow_check(n, incx);

bool result_on_device =
sycl::get_pointer_type(result, queue.get_context()) == sycl::usm::alloc::device;
auto done = queue.submit([&](sycl::handler &cgh) {
int64_t num_events = dependencies.size();
for (int64_t i = 0; i < num_events; i++) {
@@ -1128,9 +1278,15 @@ inline sycl::event nrm2(const char *func_name, Func func, sycl::queue &queue, in
auto handle = sc.get_handle(queue);
auto x_ = reinterpret_cast<const cuDataType1 *>(x);
auto res_ = reinterpret_cast<cuDataType2 *>(result);
if (result_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
}
cublasStatus_t err;
// NRM2 does not support negative index
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, std::abs(incx), res_);
if (result_on_device) {
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
}
});
});
return done;
14 changes: 14 additions & 0 deletions tests/unit_tests/blas/include/test_common.hpp
@@ -460,6 +460,13 @@ typename std::enable_if<std::is_integral<fp>::value, bool>::type check_equal(fp
return (x == x_ref);
}

template <typename fp>
bool check_equal_ptr(sycl::queue queue, fp *x, fp x_ref, int error_mag) {
fp x_host;
queue.memcpy(&x_host, x, sizeof(fp)).wait();
return check_equal(x_host, x_ref, error_mag);
}

template <typename fp>
bool check_equal_trsm(fp x, fp x_ref, int error_mag) {
using fp_real = typename complex_info<fp>::real_type;
Expand Down Expand Up @@ -487,6 +494,13 @@ bool check_equal(fp x, fp x_ref, int error_mag, std::ostream &out) {
return good;
}

template <typename fp>
bool check_equal_ptr(sycl::queue queue, fp *x, fp x_ref, int error_mag, std::ostream &out) {
fp x_host;
queue.memcpy(&x_host, x, sizeof(fp)).wait();
return check_equal(x_host, x_ref, error_mag, out);
}

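A hypothetical test fragment (not part of this commit) showing how the new helper is meant to be used when the output scalar lives in device USM:

    template <typename fp>
    bool verify_device_scalar(sycl::queue queue, fp *device_result, fp reference, std::ostream &out) {
        // check_equal_ptr copies the device scalar back to the host and then
        // forwards to check_equal; 4 is an illustrative error magnitude.
        return check_equal_ptr(queue, device_result, reference, 4, out);
    }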
template <typename fp>
bool check_equal_vector(const fp *v, const fp *v_ref, int n, int inc, int error_mag,
std::ostream &out) {