[Perf] Eliminate padding and slicing op for GPT-OSS with Flashinfer MXFP4 MXFP8 MoE (#30647)

Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
2026-03-18 23:01:26 +08:00
parent c373b5c00d
commit 296839a1b0
6 changed files with 40 additions and 3 deletions
--- a/tests/compile/fusions_e2e/models.py
+++ b/tests/compile/fusions_e2e/models.py
@@ -162,3 +162,12 @@ deepseek_v3_fp8 = ModelFusionInfo(
        # async_tp=n_layers * 2,
    ),
 )
+
+gpt_oss_20b = ModelFusionInfo(  # fusions_e2e entry for openai/gpt-oss-20b
+    model_name="openai/gpt-oss-20b",
+    matches=lambda n_layers: Matches(  # counts derived from the layer count
+        ar_rms_fusion=n_layers * 2 + 1,
+        sequence_parallel=n_layers * 2 + 1,
+        async_tp=n_layers * 2,  # NOTE(review): disabled in deepseek_v3_fp8; confirm
+    ),
+)