[Distributed] Add enable_expert_parallel arg (#14305)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
This commit is contained in:
committed by
GitHub
parent
cd579352bf
commit
cc2f9b32c8
@@ -86,7 +86,6 @@ if TYPE_CHECKING:
|
||||
VLLM_MLA_PERFORM_MATRIX_ABSORPTION: bool = True
|
||||
VLLM_MLA_DISABLE_REQUANTIZATION: bool = False
|
||||
VLLM_MLA_CUDA_MEM_ALIGN_KV_CACHE: bool = True
|
||||
VLLM_TEST_ENABLE_EP: bool = False
|
||||
VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False
|
||||
VLLM_RAY_PER_WORKER_GPUS: float = 1.0
|
||||
VLLM_RAY_BUNDLE_INDICES: str = ""
|
||||
@@ -579,12 +578,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
lambda: bool(int(os.getenv("VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0"))
|
||||
),
|
||||
|
||||
# If set, vLLM will use the experimental expert parallel implementation on
|
||||
# the FusedMoE layer, using tensor parallelism size as expert parallelism
|
||||
# size.
|
||||
"VLLM_TEST_ENABLE_EP":
|
||||
lambda: bool(int(os.getenv("VLLM_TEST_ENABLE_EP", "0"))),
|
||||
|
||||
# Number of GPUs per worker in Ray. If it is set to a fraction,
|
||||
# it allows Ray to schedule multiple actors on a single GPU,
|
||||
# so that users can colocate other actors on the same GPUs as vLLM.
|
||||
|
||||
Reference in New Issue
Block a user