diff --git a/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h b/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h
index dada8ea67..e2a3fa88c 100644
--- a/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h
+++ b/mlir-tensorrt/executor/include/mlir-executor-c/Runtime/Runtime.h
@@ -32,6 +32,8 @@
 #include <stdbool.h>
 #include <stdint.h>
 
+#include "cuda_runtime.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -323,24 +325,17 @@ mtrtScalarValueGetType(MTRT_ScalarValue scalar, MTRT_ScalarTypeCode *code);
 //===----------------------------------------------------------------------===//
 // MTRT_GpuAllocator
 //===----------------------------------------------------------------------===//
 
-// Function pointer types for the allocate and deallocate callbacks
-typedef void* (*AllocateFunc)(void* self, uint64_t size);
-typedef bool (*DeallocateFunc)(void* self, void* memory);
-
-// The MTRT_GpuAllocator struct
+// Function pointer types for the allocate and deallocate callbacks.
+typedef void *(*AllocateFunc)(void *self, uint64_t size, uint64_t alignment,
+                              uint32_t flags, cudaStream_t *stream);
+typedef bool (*DeallocateFunc)(void *self, void *memory, cudaStream_t *stream);
 typedef struct MTRT_GpuAllocator {
-  void* ptr; // Pointer to the implementation (PyGpuAllocatorTrampoline in our case)
-  AllocateFunc allocate;     // Function pointer for allocation
-  DeallocateFunc deallocate; // Function pointer for deallocation
+  void *ptr; // Pointer to the implementation (PyGpuAllocatorTrampoline in our
+             // case).
+  AllocateFunc allocate;     // Function pointer for allocation
+  DeallocateFunc deallocate; // Function pointer for deallocation
 } MTRT_GpuAllocator;
 
-/// Checks nullity of `GpuAllocator`.
-MTRT_CAPI_EXPORTED bool GpuAllocatorIsNull(MTRT_GpuAllocator gpuAllocator);
-
-MTRT_CAPI_EXPORTED MTRT_Status
-GpuAllocatorDestroy(MTRT_GpuAllocator executable);
-
 //===----------------------------------------------------------------------===//
 // MTRT_RuntimeSessionOptions
 //===----------------------------------------------------------------------===//
diff --git a/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h b/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h
index 393a5a091..536619ba7 100644
--- a/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h
+++ b/mlir-tensorrt/executor/include/mlir-executor/Support/Allocators.h
@@ -36,16 +36,24 @@ class GpuAllocator {
 public:
   GpuAllocator() = default;
   virtual ~GpuAllocator() = default;
-  virtual void* allocate(uint64_t const size) { return nullptr; }
-  virtual bool deallocate(void *const memory) { return false; }
+  virtual void *allocate(uint64_t const size, uint64_t const alignment,
+                         uint32_t flags, cudaStream_t *stream) {
+    return nullptr;
+  }
+  virtual bool deallocate(void *const memory, cudaStream_t *stream) {
+    return false;
+  }
 };
 
 class CustomTensorRTAllocator : public GpuAllocator {
 public:
   CustomTensorRTAllocator() = default;
   ~CustomTensorRTAllocator() = default;
-  void* allocate(uint64_t const size) override;
-  bool deallocate(void *const memory) override;
+  void *allocate(uint64_t const size, uint64_t const alignment, uint32_t flags,
+                 cudaStream_t *stream) override;
+  bool deallocate(void *const memory, cudaStream_t *stream) override;
 };
 
 //===----------------------------------------------------------------------===//
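Reviewer note: a minimal sketch (not part of this patch) of what a client-side implementation of the new stream-aware `GpuAllocator` interface could look like. `SimpleCudaAllocator` is an invented name, the `mlirtrt` namespace is omitted, and `cudaMallocAsync`/`cudaFreeAsync` assume CUDA 11.2+:

```cpp
#include <cstdint>
#include <cuda_runtime.h>

// Interface as declared in Allocators.h above (namespace omitted).
class GpuAllocator {
public:
  virtual ~GpuAllocator() = default;
  virtual void *allocate(uint64_t const size, uint64_t const alignment,
                         uint32_t flags, cudaStream_t *stream) {
    return nullptr;
  }
  virtual bool deallocate(void *const memory, cudaStream_t *stream) {
    return false;
  }
};

// Hypothetical client allocator: stream-ordered when a stream is supplied,
// synchronous otherwise -- the same split CustomTensorRTAllocator makes in
// Allocators.cpp further down in this diff.
class SimpleCudaAllocator : public GpuAllocator {
public:
  void *allocate(uint64_t const size, uint64_t const /*alignment*/,
                 uint32_t /*flags*/, cudaStream_t *stream) override {
    void *memory = nullptr;
    if (stream && *stream)
      cudaMallocAsync(&memory, size, *stream); // ordered on the given stream
    else
      cudaMalloc(&memory, size); // synchronous fallback
    return memory;
  }
  bool deallocate(void *const memory, cudaStream_t *stream) override {
    cudaError_t status = (stream && *stream) ? cudaFreeAsync(memory, *stream)
                                             : cudaFree(memory);
    return status == cudaSuccess;
  }
};
```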
diff --git a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp
index 253817934..7e7eb73be 100644
--- a/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp
+++ b/mlir-tensorrt/executor/lib/CAPI/Runtime/Runtime.cpp
@@ -601,29 +601,6 @@ MTRT_ScalarValue mtrtRuntimeValueDynCastToScalar(MTRT_RuntimeValue v) {
   return wrap(static_cast<ScalarValue *>(x));
 }
 
-//===----------------------------------------------------------------------===//
-// MTRT_GpuAllocator
-//===----------------------------------------------------------------------===//
-
-bool GpuAllocatorIsNull(MTRT_GpuAllocator gpuAllocator) {
-  return !gpuAllocator.ptr;
-}
-
-MTRT_Status GpuAllocatorDestroy(MTRT_GpuAllocator gpuAllocator) {
-  // delete unwrap(gpuAllocator);
-  return mtrtStatusGetOk();
-}
-
-// TODO: Implement destroy method to release resources.
-// void mtrtGpuAllocatorDestroy(MTRT_GpuAllocator* allocator) {
-//   if (allocator && allocator->ptr) {
-//     delete static_cast(allocator->ptr);
-//     allocator->ptr = nullptr;
-//     allocator->allocate = nullptr;
-//     allocator->deallocate = nullptr;
-//   }
-// }
-
 //===----------------------------------------------------------------------===//
 // MTRT_RuntimeSessionOptions
 //===----------------------------------------------------------------------===//
@@ -660,12 +637,12 @@ class GpuAllocatorWrapper : public GpuAllocator {
   GpuAllocatorWrapper(MTRT_GpuAllocator gpuAllocator)
       : mPyGpuAllocator(gpuAllocator) {}
 
-  void *allocate(uint64_t size) override {
-    return mPyGpuAllocator.allocate(mPyGpuAllocator.ptr, size);
+  void *allocate(uint64_t size, uint64_t alignment, uint32_t flags,
+                 cudaStream_t *stream) override {
+    return mPyGpuAllocator.allocate(mPyGpuAllocator.ptr, size, alignment,
+                                    flags, stream);
   }
 
-  bool deallocate(void *ptr) override {
-    return mPyGpuAllocator.deallocate(mPyGpuAllocator.ptr, ptr);
+  bool deallocate(void *ptr, cudaStream_t *stream) override {
+    return mPyGpuAllocator.deallocate(mPyGpuAllocator.ptr, ptr, stream);
   }
 
   // Static method to create a GpuAllocator from MTRT_GpuAllocator
diff --git a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp
index fed3a573e..1ae3b4440 100644
--- a/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp
+++ b/mlir-tensorrt/executor/lib/Runtime/Backend/Lua/Modules/TensorRT/TensorRTModule.cpp
@@ -68,29 +68,21 @@ class StdioLogger : public nvinfer1::ILogger {
 // TensorRTCallBackAllocator
 //===----------------------------------------------------------------------===//
 
-class TensorRTCallBackAllocator final : public nvinfer1::IGpuAllocator {
+class TensorRTCallBackAllocator final : public nvinfer1::IGpuAsyncAllocator {
 public:
   TensorRTCallBackAllocator(GpuAllocator *gpuAllocator)
-      : nvinfer1::IGpuAllocator(), mGpuAllocatorCallBack(gpuAllocator) {}
+      : nvinfer1::IGpuAsyncAllocator(), mGpuAllocatorCallBack(gpuAllocator) {}
 
-  void *allocate(uint64_t size, uint64_t alignment,
-                 nvinfer1::AllocatorFlags flags) noexcept final {
-    return allocateAsync(size, alignment, flags, nullptr);
-  }
-
-  bool deallocate(void *memory) noexcept final {
-    return deallocateAsync(memory, nullptr);
-  }
-
-  void *allocateAsync(uint64_t const size, uint64_t const /*alignment*/,
-                      uint32_t /*flags*/,
-                      cudaStream_t /*stream*/) noexcept final {
-    void* result = mGpuAllocatorCallBack->allocate(size);
+  void *allocateAsync(uint64_t const size, uint64_t const alignment,
+                      uint32_t flags, cudaStream_t stream) noexcept final {
+    void *result =
+        mGpuAllocatorCallBack->allocate(size, alignment, flags, &stream);
     return result;
   }
 
   bool deallocateAsync(void *const memory,
-                       cudaStream_t /*stream*/) noexcept override {
-    bool result = mGpuAllocatorCallBack->deallocate(memory);
+                       cudaStream_t stream) noexcept override {
+    bool result = mGpuAllocatorCallBack->deallocate(memory, &stream);
     return result;
   }
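To make the C callback contract concrete, here is an illustrative sketch (not from the patch; `myAllocate`, `myDeallocate`, and `makeAllocatorTable` are invented names) of how a client fills in the `MTRT_GpuAllocator` table that `GpuAllocatorWrapper` dispatches through. The runtime only ever sees the type-erased `ptr` plus two function pointers, so any host language that can produce C function pointers can supply an allocator:

```cpp
#include <cstdint>
#include <cuda_runtime.h>

#include "mlir-executor-c/Runtime/Runtime.h"

// Callbacks matching the AllocateFunc/DeallocateFunc typedefs. A real
// implementation would dispatch to the allocator object behind `self`;
// this placeholder ignores alignment, flags, and the stream.
static void *myAllocate(void *self, uint64_t size, uint64_t alignment,
                        uint32_t flags, cudaStream_t *stream) {
  void *memory = nullptr;
  cudaMalloc(&memory, size);
  return memory;
}

static bool myDeallocate(void *self, void *memory, cudaStream_t *stream) {
  return cudaFree(memory) == cudaSuccess;
}

// Bundle the opaque implementation pointer with the two callbacks.
MTRT_GpuAllocator makeAllocatorTable(void *impl) {
  MTRT_GpuAllocator table;
  table.ptr = impl;            // opaque implementation pointer
  table.allocate = myAllocate; // must match AllocateFunc exactly
  table.deallocate = myDeallocate;
  return table;
}
```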
diff --git a/mlir-tensorrt/executor/lib/Support/Allocators.cpp b/mlir-tensorrt/executor/lib/Support/Allocators.cpp
index 70f21b9f7..100cb0361 100644
--- a/mlir-tensorrt/executor/lib/Support/Allocators.cpp
+++ b/mlir-tensorrt/executor/lib/Support/Allocators.cpp
@@ -46,14 +46,46 @@ using namespace mlirtrt;
 // CustomTensorRTAllocator
 //===----------------------------------------------------------------------===//
 
-void *CustomTensorRTAllocator::allocate(uint64_t const size) {
+void *CustomTensorRTAllocator::allocate(uint64_t const size,
+                                        uint64_t const alignment,
+                                        uint32_t /*flags*/,
+                                        cudaStream_t *stream) {
   uint8_t *memory;
-  cudaMalloc(reinterpret_cast<void **>(&memory), size);
+  assert(alignment > 0 && (alignment & (alignment - 1)) == 0 &&
+         "Memory alignment has to be a power of 2");
+  if (stream && *stream != nullptr) {
+    cudaError_t status =
+        cudaMallocAsync(reinterpret_cast<void **>(&memory), size, *stream);
+    assert(status == cudaSuccess);
+    MTRT_DBGF("[CustomTensorRTAllocator][allocate]: asynchronously allocated "
+              "%lu bytes at 0x%lx on stream 0x%lx",
+              size, reinterpret_cast<uintptr_t>(memory),
+              reinterpret_cast<uintptr_t>(*stream));
+  } else {
+    cudaError_t status = cudaMalloc(reinterpret_cast<void **>(&memory), size);
+    assert(status == cudaSuccess);
+    MTRT_DBGF("[CustomTensorRTAllocator][allocate]: synchronously allocated "
+              "%lu bytes at 0x%lx",
+              size, reinterpret_cast<uintptr_t>(memory));
+  }
+  assert(reinterpret_cast<uintptr_t>(memory) % alignment == 0);
   return memory;
 }
 
-bool CustomTensorRTAllocator::deallocate(void *const memory) {
-  cudaFree(memory);
+bool CustomTensorRTAllocator::deallocate(void *const memory,
+                                         cudaStream_t *stream) {
+  if (stream && *stream != nullptr) {
+    MTRT_DBGF("[CustomTensorRTAllocator][deallocate]: asynchronously freeing "
+              "CUDA device memory 0x%lx on stream 0x%lx",
+              reinterpret_cast<uintptr_t>(memory),
+              reinterpret_cast<uintptr_t>(*stream));
+    cudaError_t status = cudaFreeAsync(memory, *stream);
+    assert(status == cudaSuccess);
+  } else {
+    // Note: `stream` may be null on this path, so the log message must not
+    // dereference it.
+    MTRT_DBGF("[CustomTensorRTAllocator][deallocate]: synchronously freeing "
+              "CUDA device memory 0x%lx",
+              reinterpret_cast<uintptr_t>(memory));
+    cudaError_t status = cudaFree(memory);
+    assert(status == cudaSuccess);
+  }
   return true;
 }
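Background for the stream/no-stream split above: with stream-ordered allocation, validity of the returned pointer is tied to work ordering on the stream, so the memory may only be used by work enqueued after the allocation on that stream (or on streams synchronized with it). A stand-alone illustration, not from the patch:

```cpp
#include <cuda_runtime.h>

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  void *buffer = nullptr;
  // The allocation is ordered on `stream`: it "happens" when prior work on
  // the stream has completed, not at the call site.
  cudaMallocAsync(&buffer, 1 << 20, stream);
  // ... enqueue kernels that use `buffer` on `stream` here ...
  cudaFreeAsync(buffer, stream); // the free is stream-ordered, too
  cudaStreamSynchronize(stream); // everything above has completed here
  cudaStreamDestroy(stream);
  return 0;
}
```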
diff --git a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp
index 94ebfb2d4..3f45138c8 100644
--- a/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp
+++ b/mlir-tensorrt/python/bindings/Runtime/RuntimePyBind.cpp
@@ -128,6 +128,7 @@ class PyStream : public PyMTRTWrapper {
 public:
   using Base::Base;
   DECLARE_WRAPPER_CONSTRUCTORS(PyStream);
+
   static constexpr auto kMethodTable = CAPITable<MTRT_Stream>{
       mtrtStreamIsNull, mtrtStreamDestroy, mtrtPythonCapsuleToStream,
       mtrtPythonStreamToCapsule};
@@ -195,7 +196,8 @@ class PyGpuAllocator {
   PyGpuAllocator(py::object self) : pySelf(self) {}
   virtual ~PyGpuAllocator() = default;
 
-  virtual std::uintptr_t allocate(uint64_t size) = 0;
+  virtual std::uintptr_t allocate(uint64_t size, uint64_t alignment,
+                                  uint32_t flags) = 0;
   virtual bool deallocate(std::uintptr_t ptr) = 0;
 
   // Creates a C-compatible struct for interfacing with lower-level APIs.
@@ -203,16 +205,19 @@ class PyGpuAllocator {
 private:
   // Trampoline function: Routes C-style allocation calls to C++ virtual
   // method.
-  static void *pyGpuAllocatorAllocate(void *self, uint64_t size) {
+  static void *pyGpuAllocatorAllocate(void *self, uint64_t size,
+                                      uint64_t alignment, uint32_t flags,
+                                      cudaStream_t * /*stream*/) {
     py::gil_scoped_acquire acquire;
     auto *allocator = static_cast<PyGpuAllocator *>(self);
-    std::uintptr_t ptr = allocator->allocate(size);
+    std::uintptr_t ptr = allocator->allocate(size, alignment, flags);
     return reinterpret_cast<void *>(ptr);
   }
 
   // Trampoline function: Routes C-style deallocation calls to C++ virtual
   // method.
-  static bool pyGpuAllocatorDeallocate(void *self, void *memory) {
+  static bool pyGpuAllocatorDeallocate(void *self, void *memory,
+                                       cudaStream_t * /*stream*/) {
     py::gil_scoped_acquire acquire;
     auto *allocator = static_cast<PyGpuAllocator *>(self);
     return allocator->deallocate(reinterpret_cast<std::uintptr_t>(memory));
@@ -237,12 +242,12 @@ class PyGpuAllocatorTrampoline : public PyGpuAllocator {
 
   // Trampoline for allocate: Dispatches call to Python implementation if
   // overridden.
-  uintptr_t allocate(uint64_t size) override {
+  uintptr_t allocate(uint64_t size, uint64_t alignment,
+                     uint32_t flags) override {
     PYBIND11_OVERRIDE_PURE(uintptr_t,      // Return type
                            PyGpuAllocator, // Parent class
                            allocate,       // Name of function in C++
-                           size            // Arguments
-    );
+                           size,           // Arguments
+                           alignment, flags);
   }
 
   // Trampoline for deallocate: Dispatches call to Python implementation if
@@ -251,8 +256,7 @@ class PyGpuAllocatorTrampoline : public PyGpuAllocator {
     PYBIND11_OVERRIDE_PURE(bool,           // Return type
                            PyGpuAllocator, // Parent class
                            deallocate,     // Name of function in C++
-                           ptr             // Arguments
-    );
+                           ptr); // Arguments
   }
 };
diff --git a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py
index ebf31bde7..1fa932fec 100644
--- a/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py
+++ b/mlir-tensorrt/test/python/IntegrationTests/test_stablehlo_add.py
@@ -20,7 +20,7 @@ def __init__(self):
         super().__init__(self)
         self.allocations = {}  # Keep track of allocations
 
-    def allocate(self, size):
+    def allocate(self, size, alignment, flags):
         # Allocate memory on the GPU using CuPy
         mem = cp.cuda.alloc(size)
         ptr = int(mem.ptr)  # Convert to integer
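For readers unfamiliar with the trampoline machinery the test relies on, here is an illustrative reduction (not from the patch; `Allocator` and `PyAllocator` are stand-in names). `PYBIND11_OVERRIDE_PURE` forwards every listed argument to the Python override, which is why the Python test's `allocate()` must now accept `(size, alignment, flags)`:

```cpp
#include <cstdint>

#include <pybind11/pybind11.h>

namespace py = pybind11;

// Abstract C++ interface that Python code is expected to implement.
struct Allocator {
  virtual ~Allocator() = default;
  virtual std::uintptr_t allocate(uint64_t size, uint64_t alignment,
                                  uint32_t flags) = 0;
};

// Trampoline: dispatches virtual calls to a Python subclass's override.
struct PyAllocator : Allocator {
  std::uintptr_t allocate(uint64_t size, uint64_t alignment,
                          uint32_t flags) override {
    PYBIND11_OVERRIDE_PURE(std::uintptr_t, // Return type
                           Allocator,      // Parent class
                           allocate,       // Name of function in C++
                           size, alignment, flags);
  }
};

PYBIND11_MODULE(example, m) {
  py::class_<Allocator, PyAllocator>(m, "Allocator")
      .def(py::init<>())
      .def("allocate", &Allocator::allocate);
}
```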