From f8c92b858ff86a6ed7b3a873106c56442fcd543c Mon Sep 17 00:00:00 2001
From: Ken Raffenetti
Date: Tue, 1 Oct 2024 16:46:40 -0500
Subject: [PATCH] ch4/ofi: Convert CUDA device id to handle for fi_mr_regattr

Libfabric docs say that the value of the cuda field in the regattr
struct is the device handle gotten from cuDeviceGet, not the ordinal.
Fixes pmodels/mpich#7148.
---
 src/mpid/ch4/netmod/ofi/ofi_impl.h | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/mpid/ch4/netmod/ofi/ofi_impl.h b/src/mpid/ch4/netmod/ofi/ofi_impl.h
index 8803810fc13..1dc2c024755 100644
--- a/src/mpid/ch4/netmod/ofi/ofi_impl.h
+++ b/src/mpid/ch4/netmod/ofi/ofi_impl.h
@@ -707,8 +707,16 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_register_memory(char *send_buf, size_t da
     mr_attr.context = NULL;
 #ifdef MPL_HAVE_CUDA
     mr_attr.iface = (attr->type != MPL_GPU_POINTER_DEV) ? FI_HMEM_SYSTEM : FI_HMEM_CUDA;
-    mr_attr.device.cuda =
-        (attr->type != MPL_GPU_POINTER_DEV) ? 0 : MPL_gpu_get_dev_id_from_attr(attr);
+    if (attr->type == MPL_GPU_POINTER_DEV) {
+        MPL_gpu_device_handle_t dev_h;
+        int dev_id;
+
+        dev_id = MPL_gpu_get_dev_id_from_attr(attr);
+        MPL_gpu_device_id_to_handle(&dev_h, dev_id);
+        mr_attr.device.cuda = dev_h;
+    } else {
+        mr_attr.device.cuda = 0;
+    }
 #elif defined MPL_HAVE_ZE
     /* OFI does not support tiles yet, need to pass the root device. */
     mr_attr.iface = (attr->type != MPL_GPU_POINTER_DEV) ? FI_HMEM_SYSTEM : FI_HMEM_ZE;