[Kernel][perf] optimize NCCL symm_mem vs custom_AR selection thresholds (#33839)

Signed-off-by: <>
Signed-off-by: pkousha <43781676+pkousha@users.noreply.github.com>
Co-authored-by: Pouya Kousha <pkousha@login-eos01.eos.clusters.nvidia.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
This commit is contained in:
pkousha
2026-02-26 14:35:58 -08:00
committed by GitHub
parent eb19955c37
commit 31fb6f43da

View File

@@ -27,6 +27,7 @@ from vllm.utils.torch_utils import cuda_device_count_stateless
logger = init_logger(__name__)

# Byte-size units used by the allreduce size thresholds below.
KiB = 1 << 10
MiB = 1 << 20
# Max size for each world size in case symmetric memory is available
# For different SM architectures
@@ -60,17 +61,44 @@ SYMM_MEM_ALL_REDUCE_MAX_SIZES = {
},
}
# NCCL symmetric memory allreduce configuration based on H100 and GB200 benchmarks.
# PyNCCL-symm outperforms custom_AR for small and large tensor sizes,
# while custom_AR wins for mid-range sizes.
#
# Benchmark results (8 GPUs):
#   2K - 16K:   PyNCCL-symm wins (1.35x - 1.48x faster)
#   32K - 64K:  custom_AR wins
#   128K - 1G:  PyNCCL-symm wins (1.12x - 6.14x faster)
#
# Benchmark results (4 GPUs):
#   2K - 16K:   PyNCCL-symm wins (1.21x - 1.30x faster)
#   32K - 256K: custom_AR wins (1.07x - 1.35x faster)
#   512K - 1G:  PyNCCL-symm wins (1.10x - 2.32x faster)
#
# The config defines ranges where custom_AR is preferred (symm_mem disabled).
NCCL_SYMM_MEM_ALL_REDUCE_CONFIG: dict[str, Any] = {
    # Below this world size, NCCL symm_mem is never used.
    "min_world_size": 4,
    # Ranges where custom_AR outperforms NCCL symm_mem: (lower_bound, upper_bound)
    # NCCL symm_mem will NOT be used for sizes in range: lower < size < upper
    "custom_ar_preferred_ranges": {
        4: (16 * KiB, 512 * KiB),  # custom_AR wins for 32K-256K
        8: (16 * KiB, 128 * KiB),  # custom_AR wins for 32K-64K
    },
    "always_use_above_world_size": 8,  # Always use symm mem for world_size > 8
}
def should_nccl_symm_mem_allreduce(world_size: int, input_tensor: torch.Tensor) -> bool:
    """
    Determine if NCCL symmetric memory allreduce should be used.

    Based on H100 and GB200 benchmarks, NCCL symm_mem is preferred for:
    - Small tensors (≤16K): Lower latency than custom_AR
    - Large tensors (≥128K for 8 GPUs, ≥512K for 4 GPUs): Better bandwidth

    Custom_AR is preferred for mid-range sizes where its P2P approach
    has lower overhead than the symm_mem copy-in/copy-out pattern.

    Args:
        world_size: Number of ranks participating in the allreduce.
        input_tensor: Tensor to be reduced; only its byte size is inspected.

    Returns:
        True if the NCCL symmetric-memory path should handle this allreduce.
    """
    # NOTE(review): local import — presumably to avoid a circular import at
    # module load time; confirm against the package layout.
    from vllm.distributed.device_communicators.pynccl_allocator import (
        is_symmetric_memory_enabled,
    )

    if not is_symmetric_memory_enabled():
        return False
    if world_size < NCCL_SYMM_MEM_ALL_REDUCE_CONFIG["min_world_size"]:
        return False

    tensor_size = input_tensor.nbytes
    custom_ar_range = NCCL_SYMM_MEM_ALL_REDUCE_CONFIG["custom_ar_preferred_ranges"].get(
        world_size
    )
    if custom_ar_range is not None:
        lower_bound, upper_bound = custom_ar_range
        # Use symm_mem for small sizes (≤ lower_bound) and large sizes (≥ upper_bound)
        # Use custom_AR (not symm_mem) for mid-range sizes
        return tensor_size <= lower_bound or tensor_size >= upper_bound

    # No benchmarked range for this world size: fall back to symm_mem only
    # beyond the largest benchmarked configuration.
    return world_size > NCCL_SYMM_MEM_ALL_REDUCE_CONFIG["always_use_above_world_size"]