[gpt-oss] flashinfer mxfp4 (#22339)
Signed-off-by: simon-mo <xmo@berkeley.edu> Signed-off-by: Yongye Zhu <zyy1102000@gmail.com> Co-authored-by: simon-mo <xmo@berkeley.edu>
This commit is contained in:
12
vllm/envs.py
12
vllm/envs.py
@@ -154,6 +154,8 @@ if TYPE_CHECKING:
|
||||
VLLM_ENABLE_RESPONSES_API_STORE: bool = False
|
||||
VLLM_USE_TRTLLM_CONTEXT_ATTENTION: bool = False
|
||||
VLLM_USE_TRTLLM_DECODE_ATTENTION: bool = False
|
||||
VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8: bool = False
|
||||
VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False
|
||||
|
||||
|
||||
def get_default_cache_root():
|
||||
@@ -932,6 +934,16 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"VLLM_USE_FLASHINFER_MOE_FP4":
|
||||
lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP4", "0"))),
|
||||
|
||||
# If set to 1, use the FlashInfer
|
||||
# MXFP8 (activation) x MXFP4 (weight) MoE backend.
|
||||
"VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8":
|
||||
lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "0"))),
|
||||
|
||||
# If set to 1, use the FlashInfer
|
||||
# BF16 (activation) x MXFP4 (weight) MoE backend.
|
||||
"VLLM_USE_FLASHINFER_MOE_MXFP4_BF16":
|
||||
lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16", "0"))),
|
||||
|
||||
# Control the cache size used by the xgrammar compiler. The default
|
||||
# of 512 MB should be enough for roughly 1000 JSON schemas.
|
||||
# It can be changed with this variable if needed for some reason.
|
||||
|
||||
Reference in New Issue
Block a user