Fix NUMA binding on non-CDMM Grace-Blackwell systems (#39361)
Signed-off-by: Qidong Su <soodoshll@gmail.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -661,7 +661,18 @@ class NvmlCudaPlatform(CudaPlatformBase):
|
||||
handle = pynvml.nvmlDeviceGetHandleByIndex(physical_device_id)
|
||||
|
||||
try:
|
||||
return pynvml.nvmlDeviceGetNumaNodeId(handle)
|
||||
numa_node = pynvml.nvmlDeviceGetNumaNodeId(handle)
|
||||
if cls._numa_node_has_cpus(numa_node):
|
||||
return numa_node
|
||||
# On non-CDMM Grace-Blackwell systems (e.g. GB200), each GPU's HBM
|
||||
# is a separate NUMA node with no CPUs. Fall through to
|
||||
# CPU-affinity-based detection to find the nearest CPU node.
|
||||
logger.debug(
|
||||
"NUMA node %d for GPU %d has no CPUs (non-CDMM topology), "
|
||||
"falling back to CPU-affinity-based detection",
|
||||
numa_node,
|
||||
device_id,
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
@@ -681,6 +692,17 @@ class NvmlCudaPlatform(CudaPlatformBase):
|
||||
|
||||
return None
|
||||
|
||||
@classmethod
def _numa_node_has_cpus(cls, node_id: int) -> bool:
    """Report whether sysfs lists at least one CPU for a NUMA node.

    Reads ``/sys/devices/system/node/node<node_id>/cpulist`` and treats a
    blank file as "no CPUs".

    Args:
        node_id: NUMA node number used to build the sysfs path.

    Returns:
        ``True`` when the cpulist file contains non-whitespace text;
        ``False`` when it is blank or when reading it raises ``OSError``
        or ``ValueError``.
    """
    from pathlib import Path

    sysfs_entry = Path(f"/sys/devices/system/node/node{node_id}/cpulist")
    try:
        listed_cpus = sysfs_entry.read_text()
    except (OSError, ValueError):
        # Missing, unreadable, or undecodable entry: report no CPUs.
        return False
    # Any non-whitespace content means at least one CPU is listed.
    return bool(listed_cpus.strip())
|
||||
|
||||
@classmethod
|
||||
def _get_device_cpu_affinity(cls, handle) -> list[int]:
|
||||
"""Get the list of CPU IDs associated with a GPU via NVML."""
|
||||
|
||||
Reference in New Issue
Block a user