Fix NUMA binding on non-CDMM Grace-Blackwell systems (#39361)

Signed-off-by: Qidong Su <soodoshll@gmail.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Qidong Su
2026-04-09 03:36:51 -04:00
committed by GitHub
parent 8a34c5087a
commit ed733802f0

View File

@@ -661,7 +661,18 @@ class NvmlCudaPlatform(CudaPlatformBase):
handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
try:
return pynvml.nvmlDeviceGetNumaNodeId(handle)
numa_node = pynvml.nvmlDeviceGetNumaNodeId(handle)
if cls._numa_node_has_cpus(numa_node):
return numa_node
# On non-CDMM Grace-Blackwell systems (e.g. GB200), each GPU's HBM
# is a separate NUMA node with no CPUs. Fall through to
# CPU-affinity-based detection to find the nearest CPU node.
logger.debug(
"NUMA node %d for GPU %d has no CPUs (non-CDMM topology), "
"falling back to CPU-affinity-based detection",
numa_node,
device_id,
)
except Exception:
pass
@@ -681,6 +692,17 @@ class NvmlCudaPlatform(CudaPlatformBase):
return None
@classmethod
def _numa_node_has_cpus(cls, node_id: int) -> bool:
"""Check whether a NUMA node has any CPUs assigned to it."""
from pathlib import Path
cpulist_file = Path(f"/sys/devices/system/node/node{node_id}/cpulist")
try:
return cpulist_file.read_text().strip() != ""
except (OSError, ValueError):
return False
@classmethod
def _get_device_cpu_affinity(cls, handle) -> list[int]:
"""Get the list of CPU IDs associated with a GPU via NVML."""