From 528e9b14900fc8a012f2599172e2a4576caafe1a Mon Sep 17 00:00:00 2001 From: Kebe Date: Mon, 2 Feb 2026 23:55:46 +0900 Subject: [PATCH] [Feature][Core] Support Fabric detection to adapt the MNNVL protocol for the GB series (#33540) Signed-off-by: Kebe Signed-off-by: youkaichao Co-authored-by: Thomas Vegas Co-authored-by: youkaichao --- csrc/cumem_allocator.cpp | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/csrc/cumem_allocator.cpp b/csrc/cumem_allocator.cpp index 6c2c18a66..58ce8f71a 100644 --- a/csrc/cumem_allocator.cpp +++ b/csrc/cumem_allocator.cpp @@ -115,11 +115,28 @@ void create_and_map(unsigned long long device, ssize_t size, CUdeviceptr d_mem, if (flag) { // support GPUDirect RDMA if possible prop.allocFlags.gpuDirectRDMACapable = 1; } + int fab_flag = 0; + CUDA_CHECK(cuDeviceGetAttribute( + &fab_flag, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, device)); + if (fab_flag) { // support fabric handle if possible + prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC; + } #endif #ifndef USE_ROCM // Allocate memory using cuMemCreate - CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0)); + CUresult ret = (CUresult)cuMemCreate(p_memHandle, size, &prop, 0); + if (ret) { + if (fab_flag && + (ret == CUDA_ERROR_NOT_PERMITTED || ret == CUDA_ERROR_NOT_SUPPORTED)) { + // Fabric allocation may fail without multi-node nvlink, + // fallback to POSIX file descriptor + prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + CUDA_CHECK(cuMemCreate(p_memHandle, size, &prop, 0)); + } else { + CUDA_CHECK(ret); + } + } if (error_code != 0) { return; }