[Misc] Add override for allreduce fusion thresholds (#23639)

Signed-off-by: Julien Lin <jullin@nvidia.com>
2025-08-26 23:53:04 +08:00
parent 9d4183dd2e
commit 7ea22e42d5
2 changed files with 24 additions and 0 deletions
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import hashlib
+import json
 import os
 import sys
 import tempfile
@@ -1046,6 +1047,16 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE":
    lambda: int(os.getenv("VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840")),

+    # Specifies the thresholds of the communicated tensor sizes under which
+    # vllm should use flashinfer fused allreduce. The variable should be a
+    # JSON object (keys must be quoted strings) with the following format:
+    #     { "<world size>": <max size in MB> }
+    # Unspecified world sizes will fall back to
+    #     { 2: 64, 4: 1, <everything else>: 0.5 }
+    "VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB":
+    lambda: json.loads(os.getenv(
+        "VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB", "{}")),
+
    # MoE routing strategy selector.
    # See `RoutingSimulator.get_available_strategies()` # for available
    # strategies.