diff --git a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py
index 623ff5913..d55b30599 100644
--- a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py
+++ b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py
@@ -62,6 +62,11 @@ FI_ALLREDUCE_FUSION_MAX_SIZE_MB: dict[int, dict[int, float]] = {
         4: 32,  # 32MB
         8: 1,  # 1MB
     },
+    103: {
+        2: 64,  # 64MB
+        4: 64,  # 64MB
+        8: 2,  # 2MB
+    },
 }
 
 # Max size of the input tensor per world size per device capability
@@ -78,6 +83,11 @@ _FI_ALLREDUCE_ONE_SHOT_MAX_SIZES_MB: dict[int, dict[int, float]] = {
         4: 4,  # 4MB
         8: 1,  # 1MB
     },
+    103: {
+        2: 32,  # 32MB
+        4: 4,  # 4MB
+        8: 2,  # 2MB
+    },
 }
 
 
diff --git a/vllm/distributed/device_communicators/all_reduce_utils.py b/vllm/distributed/device_communicators/all_reduce_utils.py
index 3c347ef75..9777be5aa 100644
--- a/vllm/distributed/device_communicators/all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/all_reduce_utils.py
@@ -44,6 +44,12 @@ CUSTOM_ALL_REDUCE_MAX_SIZES = {
         6: 1 * MiB,  # 1 MB
         8: 1 * MiB,  # 1 MB
     },
+    "10.3": {
+        2: 4 * MiB,  # 4 MB
+        4: 4 * MiB,  # 4 MB
+        6: 8 * MiB,  # 8 MB
+        8: 4 * MiB,  # 4 MB
+    },
 }
 
 SYMM_MEM_ALL_REDUCE_MAX_SIZES = {
@@ -59,6 +65,12 @@ SYMM_MEM_ALL_REDUCE_MAX_SIZES = {
         6: 128 * MiB,  # 128 MB
         8: 128 * MiB,  # 128 MB
     },
+    "10.3": {
+        2: 4 * MiB,  # 4 MB
+        4: 32 * MiB,  # 32 MB
+        6: 32 * MiB,  # 32 MB
+        8: 64 * MiB,  # 64 MB
+    },
 }
 
 # NCCL symmetric memory allreduce configuration based on H100 and GB200 benchmarks.
diff --git a/vllm/distributed/device_communicators/symm_mem.py b/vllm/distributed/device_communicators/symm_mem.py
index 98c7ac20a..c25ff8cf1 100644
--- a/vllm/distributed/device_communicators/symm_mem.py
+++ b/vllm/distributed/device_communicators/symm_mem.py
@@ -28,6 +28,7 @@ class SymmMemCommunicator:
     _WORLD_SIZES_MULTIMEM = {
         "9.0": [4, 6, 8],
         "10.0": [6, 8],
+        "10.3": [6, 8],
     }
 
     def __init__(