[Perf] Enable FlashInfer DeepGEMM swapAB on SM90 by default (#34924)
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
This commit is contained in:
@@ -5,6 +5,7 @@ from collections.abc import Callable
|
||||
import pytest
|
||||
|
||||
from vllm.config import PassConfig
|
||||
from vllm.utils.flashinfer import is_flashinfer_fp8_blockscale_gemm_supported
|
||||
|
||||
from .common import (
|
||||
INDUCTOR_GRAPH_PARTITION,
|
||||
@@ -50,6 +51,10 @@ def test_tp1_fp8_fusions(
|
||||
run_e2e_fusion_test,
|
||||
monkeypatch,
|
||||
):
|
||||
if use_deepgemm and is_flashinfer_fp8_blockscale_gemm_supported():
|
||||
# FlashInfer block FP8 GEMM has internal quantization, so it can't
|
||||
# be fused with other ops.
|
||||
pytest.skip("FlashInfer block FP8 GEMM not supported")
|
||||
if use_deepgemm and is_blackwell():
|
||||
# TODO(luka) DeepGEMM uses different quants, matching not supported
|
||||
# - on Blackwell, uses a special quant fp8, currently not supported
|
||||
|
||||
Reference in New Issue
Block a user