[BUGFIX] Fix CUDA OOM error: invalid argument at cumem_allocator.cpp:119 (#35594)

Signed-off-by: xjx <493337577@qq.com>
This commit is contained in:
xjx
2026-03-16 23:10:42 +08:00
committed by GitHub
parent 8d8855fdae
commit 18be11fd59

View File

@@ -109,16 +109,18 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem,
#ifndef USE_ROCM #ifndef USE_ROCM
int flag = 0; int flag = 0;
CUDA_CHECK(cuDeviceGetAttribute( CUresult rdma_result = cuDeviceGetAttribute(
&flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED, &flag, CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED,
device)); device);
if (flag) { // support GPUDirect RDMA if possible if (rdma_result == CUDA_SUCCESS &&
flag) { // support GPUDirect RDMA if possible
prop.allocFlags.gpuDirectRDMACapable = 1; prop.allocFlags.gpuDirectRDMACapable = 1;
} }
int fab_flag = 0; int fab_flag = 0;
CUDA_CHECK(cuDeviceGetAttribute( CUresult fab_result = cuDeviceGetAttribute(
&fab_flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device)); &fab_flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device);
if (fab_flag) { // support fabric handle if possible if (fab_result == CUDA_SUCCESS &&
fab_flag) { // support fabric handle if possible
prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
} }
#endif #endif