[Misc] Add override for allreduce fusion thresholds (#23639)
Signed-off-by: Julien Lin <jullin@nvidia.com>
This commit is contained in:
11
vllm/envs.py
11
vllm/envs.py
@@ -2,6 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
@@ -1046,6 +1047,16 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE":
|
||||
lambda: int(os.getenv("VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840")),
|
||||
|
||||
# Specifies the thresholds of the communicated tensor sizes under which
|
||||
# vllm should use flashinfer fused allreduce. The variable should be a
|
||||
# JSON with the following format:
|
||||
# { "<world size>": <max size in MB> }
|
||||
# Unspecified world sizes will fall back to
|
||||
# { 2: 64, 4: 1, <everything else>: 0.5 }
|
||||
"VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB":
|
||||
lambda: json.loads(os.getenv(
|
||||
"VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB", "{}")),
|
||||
|
||||
# MoE routing strategy selector.
|
||||
# See `RoutingSimulator.get_available_strategies()` for available
|
||||
# strategies.
|
||||
|
||||
Reference in New Issue
Block a user