From 61e381dcf01f25cc8a006ecf0ba9c31dde662b42 Mon Sep 17 00:00:00 2001
From: Mohammad Miadh Angkad <176301910+mmangkad@users.noreply.github.com>
Date: Sun, 22 Mar 2026 03:43:47 +0800
Subject: [PATCH] [Perf] Add SM 10.3 (B300/GB300) all-reduce communicator
 tuning (#37756)

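Add compute-capability 10.3 entries next to the existing ones in the
all-reduce tuning tables:

- FI_ALLREDUCE_FUSION_MAX_SIZE_MB and _FI_ALLREDUCE_ONE_SHOT_MAX_SIZES_MB
  (allreduce_rms_fusion.py), keyed by 103
- CUSTOM_ALL_REDUCE_MAX_SIZES and SYMM_MEM_ALL_REDUCE_MAX_SIZES
  (all_reduce_utils.py), keyed by "10.3"
- SymmMemCommunicator._WORLD_SIZES_MULTIMEM (symm_mem.py), world sizes
  6 and 8

Signed-off-by: Mohammad Miadh Angkad <176301910+mmangkad@users.noreply.github.com>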
---
 .../passes/fusion/allreduce_rms_fusion.py            | 10 ++++++++++
 .../device_communicators/all_reduce_utils.py         | 12 ++++++++++++
 vllm/distributed/device_communicators/symm_mem.py    |  1 +
 3 files changed, 23 insertions(+)

diff --git a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py
index 623ff5913..d55b30599 100644
--- a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py
+++ b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py
@@ -62,6 +62,11 @@ FI_ALLREDUCE_FUSION_MAX_SIZE_MB: dict[int, dict[int, float]] = {
         4: 32,  # 32MB
         8: 1,  # 1MB
     },
+    103: {
+        2: 64,  # 64MB
+        4: 64,  # 64MB
+        8: 2,  # 2MB
+    },
 }
 
 # Max size of the input tensor per world size per device capability
@@ -78,6 +83,11 @@ _FI_ALLREDUCE_ONE_SHOT_MAX_SIZES_MB: dict[int, dict[int, float]] = {
         4: 4,  # 4MB
         8: 1,  # 1MB
     },
+    103: {
+        2: 32,  # 32MB
+        4: 4,  # 4MB
+        8: 2,  # 2MB
+    },
 }
 
 
diff --git a/vllm/distributed/device_communicators/all_reduce_utils.py b/vllm/distributed/device_communicators/all_reduce_utils.py
index 3c347ef75..9777be5aa 100644
--- a/vllm/distributed/device_communicators/all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/all_reduce_utils.py
@@ -44,6 +44,12 @@ CUSTOM_ALL_REDUCE_MAX_SIZES = {
         6: 1 * MiB,  # 1 MB
         8: 1 * MiB,  # 1 MB
     },
+    "10.3": {
+        2: 4 * MiB,  # 4 MB
+        4: 4 * MiB,  # 4 MB
+        6: 8 * MiB,  # 8 MB
+        8: 4 * MiB,  # 4 MB
+    },
 }
 
 SYMM_MEM_ALL_REDUCE_MAX_SIZES = {
@@ -59,6 +65,12 @@ SYMM_MEM_ALL_REDUCE_MAX_SIZES = {
         6: 128 * MiB,  # 128 MB
         8: 128 * MiB,  # 128 MB
     },
+    "10.3": {
+        2: 4 * MiB,  # 4 MB
+        4: 32 * MiB,  # 32 MB
+        6: 32 * MiB,  # 32 MB
+        8: 64 * MiB,  # 64 MB
+    },
 }
 
 # NCCL symmetric memory allreduce configuration based on H100 and GB200 benchmarks.
diff --git a/vllm/distributed/device_communicators/symm_mem.py b/vllm/distributed/device_communicators/symm_mem.py
index 98c7ac20a..c25ff8cf1 100644
--- a/vllm/distributed/device_communicators/symm_mem.py
+++ b/vllm/distributed/device_communicators/symm_mem.py
@@ -28,6 +28,7 @@ class SymmMemCommunicator:
     _WORLD_SIZES_MULTIMEM = {
         "9.0": [4, 6, 8],
         "10.0": [6, 8],
+        "10.3": [6, 8],
     }
 
     def __init__(