diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 2e73c56b3..66a5ee67e 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -661,7 +661,18 @@ class NvmlCudaPlatform(CudaPlatformBase): handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id) try: - return pynvml.nvmlDeviceGetNumaNodeId(handle) + numa_node = pynvml.nvmlDeviceGetNumaNodeId(handle) + if cls._numa_node_has_cpus(numa_node): + return numa_node + # On non-CDMM Grace-Blackwell systems (e.g. GB200), each GPU's HBM + # is a separate NUMA node with no CPUs. Fall through to + # CPU-affinity-based detection to find the nearest CPU node. + logger.debug( + "NUMA node %d for GPU %d has no CPUs (non-CDMM topology), " + "falling back to CPU-affinity-based detection", + numa_node, + device_id, + ) except Exception: pass @@ -681,6 +692,17 @@ class NvmlCudaPlatform(CudaPlatformBase): return None + @classmethod + def _numa_node_has_cpus(cls, node_id: int) -> bool: + """Check whether a NUMA node has any CPUs assigned to it.""" + from pathlib import Path + + cpulist_file = Path(f"/sys/devices/system/node/node{node_id}/cpulist") + try: + return cpulist_file.read_text().strip() != "" + except (OSError, ValueError): + return False + @classmethod def _get_device_cpu_affinity(cls, handle) -> list[int]: """Get the list of CPU IDs associated with a GPU via NVML."""