[UX] Add --moe-backend arg for explicit kernel selection (#33807)
Signed-off-by: mgoin <mgoin64@gmail.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
This commit is contained in:
@@ -2,13 +2,25 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from collections.abc import Callable
|
||||
from typing import Any
|
||||
from typing import Any, Literal
|
||||
|
||||
from pydantic import Field, field_validator
|
||||
|
||||
from vllm.config.utils import config
|
||||
from vllm.utils.hashing import safe_hash
|
||||
|
||||
MoEBackend = Literal[
|
||||
"auto",
|
||||
"triton",
|
||||
"deep_gemm",
|
||||
"cutlass",
|
||||
"flashinfer_trtllm",
|
||||
"flashinfer_cutlass",
|
||||
"flashinfer_cutedsl",
|
||||
"marlin",
|
||||
"aiter",
|
||||
]
|
||||
|
||||
|
||||
@config
|
||||
class KernelConfig:
|
||||
@@ -17,6 +29,26 @@ class KernelConfig:
|
||||
enable_flashinfer_autotune: bool = Field(default=None)
|
||||
"""If True, run FlashInfer autotuning during kernel warmup."""
|
||||
|
||||
moe_backend: MoEBackend = "auto"
|
||||
"""Backend for MoE expert computation kernels. Available options:
|
||||
|
||||
- "auto": Automatically select the best backend based on model and hardware\n
|
||||
- "triton": Use Triton-based fused MoE kernels\n
|
||||
- "deep_gemm": Use DeepGEMM kernels (FP8 block-quantized only)\n
|
||||
- "cutlass": Use vLLM CUTLASS kernels\n
|
||||
- "flashinfer_trtllm": Use FlashInfer with TRTLLM-GEN kernels\n
|
||||
- "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels\n
|
||||
- "flashinfer_cutedsl": Use FlashInfer with CuteDSL kernels (FP4 only)\n
|
||||
- "marlin": Use Marlin kernels (weight-only quantization)\n
|
||||
- "aiter": Use AMD AITer kernels (ROCm only)"""
|
||||
|
||||
@field_validator("moe_backend", mode="before")
|
||||
@classmethod
|
||||
def _normalize_moe_backend(cls, value: Any) -> Any:
|
||||
if isinstance(value, str):
|
||||
return value.lower().replace("-", "_")
|
||||
return value
|
||||
|
||||
def compute_hash(self) -> str:
|
||||
"""
|
||||
WARNING: Whenever a new field is added to this config,
|
||||
|
||||
Reference in New Issue
Block a user