[NVIDIA] Add SM100 Flashinfer MoE blockscale fp8 backend for low latency (#20645)

Signed-off-by: kaixih <kaixih@nvidia.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
Kaixi Hou
2025-07-19 02:33:01 -07:00
committed by GitHub
parent 7d94577138
commit 6d0734c562
6 changed files with 187 additions and 31 deletions

vllm/envs.py

@@ -119,7 +119,8 @@ if TYPE_CHECKING:
     VLLM_TPU_BUCKET_PADDING_GAP: int = 0
     VLLM_TPU_MOST_MODEL_LEN: Optional[int] = None
     VLLM_USE_DEEP_GEMM: bool = False
-    VLLM_USE_FLASHINFER_MOE: bool = False
+    VLLM_USE_FLASHINFER_MOE_FP8: bool = False
+    VLLM_USE_FLASHINFER_MOE_FP4: bool = False
     VLLM_XGRAMMAR_CACHE_MB: int = 0
     VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256
     VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False
@@ -854,9 +855,13 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_DEEP_GEMM":
lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))),
# Allow use of FlashInfer MoE kernels for fused moe ops.
"VLLM_USE_FLASHINFER_MOE_FP8":
lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP8", "0"))),
# Allow use of FlashInfer CUTLASS kernels for fused moe ops.
"VLLM_USE_FLASHINFER_MOE":
lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE", "0"))),
"VLLM_USE_FLASHINFER_MOE_FP4":
lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP4", "0"))),
# Control the cache sized used by the xgrammar compiler. The default
# of 512 MB should be enough for roughly 1000 JSON schemas.
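
Usage note (not part of the diff): both new flags are parsed with bool(int(os.getenv(...))), so exporting the variable as "1" enables the path and "0" (or leaving it unset) disables it. The sketch below is a minimal illustration of opting into the FP8 backend from Python; it assumes the variable names introduced in this commit and that vllm.envs resolves the environment_variables entries lazily on attribute access, as the module does today.

    import os

    # Set the flags before they are read; "1" -> bool(int("1")) -> True.
    os.environ["VLLM_USE_FLASHINFER_MOE_FP8"] = "1"  # opt in to the FlashInfer blockscale fp8 MoE kernels
    os.environ["VLLM_USE_FLASHINFER_MOE_FP4"] = "0"  # leave the FlashInfer CUTLASS fp4 MoE path disabled

    import vllm.envs as envs

    print(envs.VLLM_USE_FLASHINFER_MOE_FP8)  # expected: True
    print(envs.VLLM_USE_FLASHINFER_MOE_FP4)  # expected: False

The same effect can be achieved by exporting VLLM_USE_FLASHINFER_MOE_FP8=1 in the shell before launching vLLM; the Python form is shown only to mirror the parsing logic in the hunk above.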