Skip to content
This repository has been archived by the owner on Mar 28, 2023. It is now read-only.

[SYCL][CUDA] Introduced USM P2P tests #1631

Draft
wants to merge 5 commits into
base: intel
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions SYCL/USM/P2P/p2p_access.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
// REQUIRES: cuda
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: env SYCL_PI_TRACE=2 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER

#include <cassert>
#include <sycl/sycl.hpp>

using namespace sycl;

int main() {

// Note that this code will largely be removed: it is temporary due to the
// temporary lack of multiple devices per sycl context in the Nvidia backend.
// A portable implementation, using a single gpu platform, should be possible
// once the Nvidia context issues are resolved.
////////////////////////////////////////////////////////////////////////
std::vector<sycl::device> Devs;
for (const auto &plt : sycl::platform::get_platforms()) {

if (plt.get_backend() == sycl::backend::ext_oneapi_cuda)
Devs.push_back(plt.get_devices()[0]);
}
if (Devs.size() < 2) {
std::cout << "Cannot test P2P capabilities, at least two devices are "
"required, exiting."
<< std::endl;
return 0;
}

std::vector<sycl::queue> Queues;
std::transform(Devs.begin(), Devs.end(), std::back_inserter(Queues),
[](const sycl::device &D) { return sycl::queue{D}; });
////////////////////////////////////////////////////////////////////////

if (!Devs[0].ext_oneapi_can_access_peer(
Devs[1], sycl::ext::oneapi::peer_access::access_supported)) {
std::cout << "P2P access is not supported by devices, exiting."
<< std::endl;
return 0;
}

// Enables Devs[0] to access Devs[1] memory.
Devs[0].ext_oneapi_enable_peer_access(Devs[1]);

auto *arr1 = malloc<int>(2, Queues[1], usm::alloc::device);

// Calling fill on Devs[1] data with Devs[0] queue requires P2P enabled.
Queues[0].fill(arr1, 2, 2).wait();

// Access/write Devs[1] data with Devs[0] queue.
Queues[0]
.submit([&](handler &cgh) {
auto myRange = range<1>(1);
auto myKernel = ([=](id<1> idx) { arr1[0] *= 2; });

cgh.parallel_for<class p2p_access>(myRange, myKernel);
})
.wait();

int2 out;

Queues[0].memcpy(&out, arr1, 2 * sizeof(int)).wait();
assert(out[0] == 4);
assert(out[1] == 2);

sycl::free(arr1, Queues[1]);

Devs[0].ext_oneapi_disable_peer_access(Devs[1]);

return 0;
}

// CHECK: ---> piextPeerAccessGetInfo(
// CHECK: ---> piextEnablePeerAccess(
// CHECK: ---> piextDisablePeerAccess(
85 changes: 85 additions & 0 deletions SYCL/USM/P2P/p2p_atomics.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
// REQUIRES: cuda
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_61 %s -o %t.out
// RUN: env SYCL_PI_TRACE=2 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER

#include <cassert>
#include <numeric>
#include <sycl/sycl.hpp>
#include <vector>

using namespace sycl;

// number of atomic operations
constexpr size_t N = 512;

int main() {

// Note that this code will largely be removed: it is temporary due to the
// temporary lack of multiple devices per sycl context in the Nvidia backend.
// A portable implementation, using a single gpu platform, should be possible
// once the Nvidia context issues are resolved.
////////////////////////////////////////////////////////////////////////
std::vector<sycl::device> Devs;
for (const auto &plt : sycl::platform::get_platforms()) {

if (plt.get_backend() == sycl::backend::ext_oneapi_cuda)
Devs.push_back(plt.get_devices()[0]);
}
if (Devs.size() < 2) {
std::cout << "Cannot test P2P capabilities, at least two devices are "
"required, exiting."
<< std::endl;
return 0;
}

std::vector<sycl::queue> Queues;
std::transform(Devs.begin(), Devs.end(), std::back_inserter(Queues),
[](const sycl::device &D) { return sycl::queue{D}; });
////////////////////////////////////////////////////////////////////////

if (!Devs[1].ext_oneapi_can_access_peer(
Devs[0], sycl::ext::oneapi::peer_access::atomics_supported)) {
std::cout << "P2P atomics are not supported by devices, exiting."
<< std::endl;
return 0;
}

// Enables Devs[1] to access Devs[0] memory.
Devs[1].ext_oneapi_enable_peer_access(Devs[0]);

std::vector<double> input(N);
std::iota(input.begin(), input.end(), 0);

double h_sum = 0.;
for (const auto &value : input) {
h_sum += value;
}

double *d_sum = malloc_shared<double>(1, Queues[0]);
double *d_in = malloc_device<double>(N, Queues[0]);

Queues[0].memcpy(d_in, &input[0], N * sizeof(double));
Queues[0].wait();

range global_range{N};

*d_sum = 0.;
Queues[1].submit([&](handler &h) {
h.parallel_for<class peer_atomic>(global_range, [=](id<1> i) {
sycl::atomic_ref<double, sycl::memory_order::relaxed,
sycl::memory_scope::system,
access::address_space::global_space>(*d_sum) += d_in[i];
});
});
Queues[1].wait();

assert(*d_sum == h_sum);

free(d_sum, Queues[0]);
free(d_in, Queues[0]);

return 0;
}

// CHECK: ---> piextPeerAccessGetInfo(
// CHECK: ---> piextEnablePeerAccess(
78 changes: 78 additions & 0 deletions SYCL/USM/P2P/p2p_copy.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// REQUIRES: cuda
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %t.out

#include <cassert>
#include <numeric>
#include <sycl/sycl.hpp>
#include <vector>

using namespace sycl;

// Array size to copy
constexpr int N = 100;

int main() {

// Note that this code will largely be removed: it is temporary due to the
// temporary lack of multiple devices per sycl context in the Nvidia backend.
// A portable implementation, using a single gpu platform, should be possible
// once the Nvidia context issues are resolved.
////////////////////////////////////////////////////////////////////////
std::vector<sycl::device> Devs;
for (const auto &plt : sycl::platform::get_platforms()) {

if (plt.get_backend() == sycl::backend::ext_oneapi_cuda)
Devs.push_back(plt.get_devices()[0]);
}
if (Devs.size() < 2) {
std::cout << "Cannot test P2P capabilities, at least two devices are "
"required, exiting."
<< std::endl;
return 0;
}

std::vector<sycl::queue> Queues;
std::transform(Devs.begin(), Devs.end(), std::back_inserter(Queues),
[](const sycl::device &D) { return sycl::queue{D}; });
////////////////////////////////////////////////////////////////////////

if (!Devs[0].ext_oneapi_can_access_peer(
Devs[1], sycl::ext::oneapi::peer_access::access_supported)) {
std::cout << "P2P access is not supported by devices, exiting."
<< std::endl;
return 0;
}

// Enables Devs[0] to access Devs[1] memory.
Devs[0].ext_oneapi_enable_peer_access(Devs[1]);

std::vector<int> input(N);
std::iota(input.begin(), input.end(), 0);

int *arr0 = malloc<int>(N, Queues[0], usm::alloc::device);
Queues[0].memcpy(arr0, &input[0], N * sizeof(int));

int *arr1 = malloc<int>(N, Queues[1], usm::alloc::device);
// P2P copy performed here:
Queues[1].copy(arr0, arr1, N).wait();

int out[N];
Queues[1].copy(arr1, out, N).wait();

sycl::free(arr0, Queues[0]);
sycl::free(arr1, Queues[1]);

bool ok = true;
for (int i = 0; i < N; i++) {
if (out[i] != input[i]) {
printf("%d %d\n", out[i], input[i]);
ok = false;
break;
}
}

printf("%s\n", ok ? "PASS" : "FAIL");

return 0;
}