diff --git a/csrc/cumem_allocator.cpp b/csrc/cumem_allocator.cpp index 6c2c18a66..58ce8f71a 100644 --- a/csrc/cumem_allocator.cpp +++ b/csrc/cumem_allocator.cpp @@ -115,11 +115,28 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem, if (flag) { // support GPUDirect RDMA if possible prop.allocFlags.gpuDirectRDMACapable = 1; } + int fab_flag = 0; + CUDA_CHECK(cuDeviceGetAttribute( + &fab_flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device)); + if (fab_flag) { // support fabric handle if possible + prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; + } #endif #ifndef USE_ROCM // Allocate memory using cuMemCreate - CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0)); + CUresult ret = (CUresult)cuMemCreate(p_memHandle, size, &prop, 0); + if (ret) { + if (fab_flag && + (ret == CUDA_ERROR_NOT_PERMITTED || ret == CUDA_ERROR_NOT_SUPPORTED)) { + // Fabric allocation may fail without multi-node nvlink, + // fallback to POSIX file descriptor + prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0)); + } else { + CUDA_CHECK(ret); + } + } if (error_code != 0) { return; }