[Perf] Eliminate padding and slicing op for GPT-OSS with Flashinfer MXFP4 MXFP8 MoE (#30647)

Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
This commit is contained in:
elvischenv
2026-03-18 23:01:26 +08:00
committed by GitHub
parent c373b5c00d
commit 296839a1b0
6 changed files with 40 additions and 3 deletions

View File

@@ -82,6 +82,10 @@ def run_e2e_fusion_test(monkeypatch, caplog_mp_spawn):
f"attention backend '{attn_backend.backend.name}'"
)
# TODO: remove this after finishing migration from envs to model kwargs
if model_name == "openai/gpt-oss-20b":
monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1")
# Disable compile cache to make sure custom passes run.
# Otherwise, we can't verify fusion happened through the logs.
monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")