[Perf] Enable FlashInfer DeepGEMM swapAB on SM90 by default (#34924)

Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2026-02-23 20:34:41 -05:00
parent 3ef9fd0f98
commit a4bd661fb3
2 changed files with 7 additions and 2 deletions
--- a/tests/compile/fusions_e2e/test_tp1_quant.py
+++ b/tests/compile/fusions_e2e/test_tp1_quant.py
@@ -5,6 +5,7 @@ from collections.abc import Callable
 import pytest

 from vllm.config import PassConfig
+from vllm.utils.flashinfer import is_flashinfer_fp8_blockscale_gemm_supported

 from .common import (
    INDUCTOR_GRAPH_PARTITION,
@@ -50,6 +51,10 @@ def test_tp1_fp8_fusions(
    run_e2e_fusion_test,
    monkeypatch,
 ):
+    if use_deepgemm and is_flashinfer_fp8_blockscale_gemm_supported():
+        # FlashInfer block FP8 GEMM has internal quantization, so it can't
+        # be fused with other ops.
+        pytest.skip("FlashInfer block FP8 GEMM not supported")
    if use_deepgemm and is_blackwell():
        # TODO(luka) DeepGEMM uses different quants, matching not supported
        #  - on Blackwell, uses a special quant fp8, currently not supported