[UX] Add --moe-backend arg for explicit kernel selection (#33807)

Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
This commit is contained in:
Michael Goin
2026-02-25 20:44:44 -05:00
committed by GitHub
parent 1976356ee6
commit de527e1cec
37 changed files with 260 additions and 140 deletions

View File

@@ -2,13 +2,25 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Callable
from typing import Any
from typing import Any, Literal
from pydantic import Field, field_validator
from vllm.config.utils import config
from vllm.utils.hashing import safe_hash
# Closed set of user-selectable MoE kernel backends. "auto" defers the
# choice to vLLM's hardware/model-based selection logic; all other values
# force a specific kernel family.
MoEBackend = Literal[
    "auto", "triton", "deep_gemm", "cutlass",
    "flashinfer_trtllm", "flashinfer_cutlass", "flashinfer_cutedsl",
    "marlin", "aiter",
]
@config
class KernelConfig:
@@ -17,6 +29,26 @@ class KernelConfig:
# NOTE(review): the default is None, so the field is effectively tri-state
# (None = "unset/decide later"); the annotation admits None to match the
# Field(default=None) declaration.
enable_flashinfer_autotune: bool | None = Field(default=None)
"""If True, run FlashInfer autotuning during kernel warmup."""
# String values are canonicalized (lowercased, "-" -> "_") by the
# "before"-mode validator _normalize_moe_backend prior to Literal checking.
moe_backend: MoEBackend = "auto"
"""Backend for MoE expert computation kernels. Available options:
- "auto": Automatically select the best backend based on model and hardware\n
- "triton": Use Triton-based fused MoE kernels\n
- "deep_gemm": Use DeepGEMM kernels (FP8 block-quantized only)\n
- "cutlass": Use vLLM CUTLASS kernels\n
- "flashinfer_trtllm": Use FlashInfer with TRTLLM-GEN kernels\n
- "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels\n
- "flashinfer_cutedsl": Use FlashInfer with CuteDSL kernels (FP4 only)\n
- "marlin": Use Marlin kernels (weight-only quantization)\n
- "aiter": Use AMD AITer kernels (ROCm only)"""
@field_validator("moe_backend", mode="before")
@classmethod
def _normalize_moe_backend(cls, value: Any) -> Any:
    """Canonicalize string inputs (lowercase, hyphens to underscores).

    Non-string values pass through untouched for pydantic to validate.
    """
    if not isinstance(value, str):
        return value
    return value.lower().replace("-", "_")
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,