From 18be11fd59cd3bf1082170ca638ebdfa384e7ed6 Mon Sep 17 00:00:00 2001
From: xjx <30485581+flutist@users.noreply.github.com>
Date: Mon, 16 Mar 2026 23:10:42 +0800
Subject: [PATCH] [BUGFIX]fix CUDA OOM ERROR : invalid argument at cumem_allocator.cpp:119 (#35594)

Signed-off-by: xjx <493337577@qq.com>
---
 csrc/cumem_allocator.cpp | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/csrc/cumem_allocator.cpp b/csrc/cumem_allocator.cpp
index 58ce8f71a..0b720d356 100644
--- a/csrc/cumem_allocator.cpp
+++ b/csrc/cumem_allocator.cpp
@@ -109,16 +109,18 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
 
 #ifndef USE_ROCM
   int flag = 0;
-  CUDA_CHECK(cuDeviceGetAttribute(
+  CUresult rdma_result = cuDeviceGetAttribute(
       &flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
-      device));
-  if (flag) {  // support GPUDirect RDMA if possible
+      device);
+  if (rdma_result == CUDA_SUCCESS &&
+      flag) {  // support GPUDirect RDMA if possible
     prop.allocFlags.gpuDirectRDMACapable = 1;
   }
   int fab_flag = 0;
-  CUDA_CHECK(cuDeviceGetAttribute(
-      &fab_flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device));
-  if (fab_flag) {  // support fabric handle if possible
+  CUresult fab_result = cuDeviceGetAttribute(
+      &fab_flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device);
+  if (fab_result == CUDA_SUCCESS &&
+      fab_flag) {  // support fabric handle if possible
     prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
   }
 #endif