[gpt-oss] flashinfer mxfp4 (#22339)

Signed-off-by: simon-mo <xmo@berkeley.edu>
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
This commit is contained in:
Yongye Zhu
2025-08-06 12:37:27 -07:00
committed by GitHub
parent 31f5dc5b2a
commit 31f09c615f
5 changed files with 453 additions and 3 deletions

View File

@@ -154,6 +154,8 @@ if TYPE_CHECKING:
VLLM_ENABLE_RESPONSES_API_STORE: bool = False
VLLM_USE_TRTLLM_CONTEXT_ATTENTION: bool = False
VLLM_USE_TRTLLM_DECODE_ATTENTION: bool = False
VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False
VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False
def get_default_cache_root():
@@ -932,6 +934,16 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_USE_FLASHINFER_MOE_FP4":
lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP4", "0"))),
# If set to 1, use the FlashInfer
# MXFP8 (activation) x MXFP4 (weight) MoE backend.
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8":
lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "0"))),
# If set to 1, use the FlashInfer
# BF16 (activation) x MXFP4 (weight) MoE backend.
"VLLM_USE_FLASHINFER_MOE_MXFP4_BF16":
lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "0"))),
# Control the cache size used by the xgrammar compiler. The default
# of 512 MB should be enough for roughly 1000 JSON schemas.
# It can be changed with this variable if needed for some reason.