diff --git a/tests/compile/fusions_e2e/test_tp1_quant.py b/tests/compile/fusions_e2e/test_tp1_quant.py
index dff167588..f98400c2e 100644
--- a/tests/compile/fusions_e2e/test_tp1_quant.py
+++ b/tests/compile/fusions_e2e/test_tp1_quant.py
@@ -5,6 +5,7 @@ from collections.abc import Callable
 import pytest
 
 from vllm.config import PassConfig
+from vllm.utils.flashinfer import is_flashinfer_fp8_blockscale_gemm_supported
 
 from .common import (
     INDUCTOR_GRAPH_PARTITION,
@@ -50,6 +51,10 @@ def test_tp1_fp8_fusions(
     run_e2e_fusion_test,
     monkeypatch,
 ):
+    if use_deepgemm and is_flashinfer_fp8_blockscale_gemm_supported():
+        # FlashInfer block FP8 GEMM quantizes internally, so it can't
+        # be fused with other ops.
+        pytest.skip("Fusion not supported with FlashInfer block FP8 GEMM")
     if use_deepgemm and is_blackwell():
         # TODO(luka) DeepGEMM uses different quants, matching not supported
         # - on Blackwell, uses a special quant fp8, currently not supported
diff --git a/vllm/envs.py b/vllm/envs.py
index 2b341bd5b..e6b824c56 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -159,7 +159,7 @@ if TYPE_CHECKING:
         "relax",
     ] = "relax"
     VLLM_USE_FUSED_MOE_GROUPED_TOPK: bool = True
-    VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER: bool = False
+    VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER: bool = True
     VLLM_USE_FLASHINFER_MOE_FP16: bool = False
     VLLM_USE_FLASHINFER_MOE_FP8: bool = False
     VLLM_USE_FLASHINFER_MOE_FP4: bool = False
@@ -1198,7 +1198,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Allow use of FlashInfer FP8 block-scale GEMM for linear layers.
     # This uses TensorRT-LLM kernels and requires SM90+ (Hopper).
     "VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER": lambda: bool(
-        int(os.getenv("VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER", "0"))
+        int(os.getenv("VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER", "1"))
     ),
     # Allow use of FlashInfer BF16 MoE kernels for fused moe ops.
     "VLLM_USE_FLASHINFER_MOE_FP16": lambda: bool(
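
For reviewers, a minimal standalone sketch of what the default flip means in practice: the `envs.py` entry parses the variable as `bool(int(...))`, so with the new default of `"1"` the FlashInfer block-scale FP8 GEMM path is enabled unless the user explicitly exports `VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER=0`. The helper name below is illustrative only and does not exist in vLLM.

```python
import os


def blockscale_fp8_gemm_flashinfer_enabled() -> bool:
    # Illustrative helper (not part of vLLM): mirrors the envs.py lambda,
    # which parses the env var via bool(int(...)) with a default of "1".
    return bool(int(os.getenv("VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER", "1")))


if __name__ == "__main__":
    # Unset: the FlashInfer block-scale FP8 GEMM path now defaults to on.
    print(blockscale_fp8_gemm_flashinfer_enabled())  # True

    # Explicit opt-out restores the previous (pre-change) behavior.
    os.environ["VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER"] = "0"
    print(blockscale_fp8_gemm_flashinfer_enabled())  # False
```

This also explains the new test guard: when the FlashInfer block-scale FP8 GEMM is used, quantization happens inside the kernel, so the separate quant op the fusion pass would match never appears in the graph, and the affected `test_tp1_fp8_fusions` cases are skipped.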