From 61e381dcf01f25cc8a006ecf0ba9c31dde662b42 Mon Sep 17 00:00:00 2001 From: Mohammad Miadh Angkad <176301910+mmangkad@users.noreply.github.com> Date: Sun, 22 Mar 2026 03:43:47 +0800 Subject: [PATCH] [Perf] Add SM 10.3 (B300/GB300) all-reduce communicator tuning (#37756) Signed-off-by: Mohammad Miadh Angkad <176301910+mmangkad@users.noreply.github.com> --- .../passes/fusion/allreduce_rms_fusion.py | 10 ++++++++++ .../device_communicators/all_reduce_utils.py | 12 ++++++++++++ vllm/distributed/device_communicators/symm_mem.py | 1 + 3 files changed, 23 insertions(+) diff --git a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py index 623ff5913..d55b30599 100644 --- a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py +++ b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py @@ -62,6 +62,11 @@ FI_ALLREDUCE_FUSION_MAX_SIZE_MB: dict[int, dict[int, float]] = { 4: 32, # 32MB 8: 1, # 1MB }, + 103: { + 2: 64, # 64MB + 4: 64, # 64MB + 8: 2, # 2MB + }, } # Max size of the input tensor per world size per device capability @@ -78,6 +83,11 @@ _FI_ALLREDUCE_ONE_SHOT_MAX_SIZES_MB: dict[int, dict[int, float]] = { 4: 4, # 4MB 8: 1, # 1MB }, + 103: { + 2: 32, # 32MB + 4: 4, # 4MB + 8: 2, # 2MB + }, } diff --git a/vllm/distributed/device_communicators/all_reduce_utils.py b/vllm/distributed/device_communicators/all_reduce_utils.py index 3c347ef75..9777be5aa 100644 --- a/vllm/distributed/device_communicators/all_reduce_utils.py +++ b/vllm/distributed/device_communicators/all_reduce_utils.py @@ -44,6 +44,12 @@ CUSTOM_ALL_REDUCE_MAX_SIZES = { 6: 1 * MiB, # 1 MB 8: 1 * MiB, # 1 MB }, + "10.3": { + 2: 4 * MiB, # 4 MB + 4: 4 * MiB, # 4 MB + 6: 8 * MiB, # 8 MB + 8: 4 * MiB, # 4 MB + }, } SYMM_MEM_ALL_REDUCE_MAX_SIZES = { @@ -59,6 +65,12 @@ SYMM_MEM_ALL_REDUCE_MAX_SIZES = { 6: 128 * MiB, # 128 MB 8: 128 * MiB, # 128 MB }, + "10.3": { + 2: 4 * MiB, # 4 MB + 4: 32 * MiB, # 32 MB + 6: 32 * MiB, # 32 MB + 8: 64 * MiB, # 64 MB + }, } # NCCL symmetric memory allreduce configuration based on H100 and GB200 benchmarks. diff --git a/vllm/distributed/device_communicators/symm_mem.py b/vllm/distributed/device_communicators/symm_mem.py index 98c7ac20a..c25ff8cf1 100644 --- a/vllm/distributed/device_communicators/symm_mem.py +++ b/vllm/distributed/device_communicators/symm_mem.py @@ -28,6 +28,7 @@ class SymmMemCommunicator: _WORLD_SIZES_MULTIMEM = { "9.0": [4, 6, 8], "10.0": [6, 8], + "10.3": [6, 8], } def __init__(