diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index db8c0348c..12760def7 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1047,6 +1047,47 @@ steps: # Run all e2e fusion tests - pytest -v -s tests/compile/distributed/test_fusions_e2e.py +- label: Hopper Fusion E2E Tests (H100) # 10min + timeout_in_minutes: 70 + working_dir: "/vllm-workspace/" + gpu: h100 + optional: true + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/test_fusion_attn.py + commands: + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - pytest -v -s tests/compile/test_fusion_attn.py + +- label: Hopper Fusion Distributed E2E Tests (2xH100) # 70min + timeout_in_minutes: 70 + working_dir: "/vllm-workspace/" + gpu: h100 + optional: true + num_gpus: 2 + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/distributed/test_fusions_e2e.py + commands: + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + # Run all e2e fusion tests + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4' + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + - label: Blackwell GPT-OSS Eval timeout_in_minutes: 60 working_dir: "/vllm-workspace/" @@ -1346,6 +1387,18 @@ steps: - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 +- label: Sequence Parallel Tests (H100) # 60 min + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: h100 + optional: true + num_gpus: 2 + commands: + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + # Run sequence parallel tests + - pytest -v -s tests/distributed/test_sequence_parallel.py + - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py + ##### H200 test ##### - label: Distributed Tests (H200) # optional gpu: h200 @@ -1354,10 +1407,6 @@ steps: num_gpus: 2 commands: - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py - - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py - - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" - - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py index f8a629ed4..996da4a29 100644 --- a/tests/compile/distributed/test_fusions_e2e.py +++ b/tests/compile/distributed/test_fusions_e2e.py @@ -3,16 +3,26 @@ from __future__ import annotations -import itertools import logging -from collections.abc import Iterable -from typing import Any, NamedTuple +from typing import Any import pytest import regex as re +from tests.compile.fusion_test_utils import ( + CUSTOM_OPS_FP8, + CUSTOM_OPS_QUANT_RMS_NORM, + CUSTOM_OPS_RMS_NORM, + MODELS, + MODELS_FP4, + MODELS_FP8, + MODELS_GROUP_FP8, + Matches, + custom_ops_product, + is_blackwell, + run_model, +) from tests.v1.attention.utils import AttentionBackendEnum -from vllm import LLM, SamplingParams from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer @@ -20,228 +30,6 @@ from vllm.utils.torch_utils import is_torch_equal_or_newer from ...utils import flat_product, multi_gpu_test -is_blackwell = lambda: current_platform.is_device_capability_family(100) -"""Are we running on Blackwell, a lot of tests depend on it""" - - -class Matches(NamedTuple): - attention_fusion: int = 0 - allreduce_fusion: int = 0 - rms_quant_norm_fusion: int = 0 - sequence_parallel: int = 0 - async_tp: int = 0 - - -class ModelBackendTestCase(NamedTuple): - model_name: str - model_kwargs: dict[str, Any] - backend: AttentionBackendEnum - matches: Matches - - -MODELS_FP8: list[ModelBackendTestCase] = [] -MODELS_FP4: list[ModelBackendTestCase] = [] -MODELS_GROUP_FP8: list[ModelBackendTestCase] = [] -MODELS: list[ModelBackendTestCase] = [] # tp-only - -if current_platform.is_cuda(): - MODELS_FP8 = [ - ModelBackendTestCase( - # Use smaller model for L40s in CI - model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", - model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), - backend=AttentionBackendEnum.TRITON_ATTN, - matches=Matches( - attention_fusion=32, - allreduce_fusion=65, - sequence_parallel=65, - async_tp=128, - ), - ), - ModelBackendTestCase( - model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", - model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), - # TODO FlashInfer attn broken on Hopper with kvcache=fp8: - # https://github.com/vllm-project/vllm/issues/28568 - backend=AttentionBackendEnum.FLASHINFER - if is_blackwell() - else AttentionBackendEnum.TRITON_ATTN, - matches=Matches( - attention_fusion=48, - allreduce_fusion=96, - sequence_parallel=96, - async_tp=95, # mlp is moe, no fusion there - ), - ), - ] - - MODELS_FP4 = [ - ModelBackendTestCase( - model_name="nvidia/Llama-3.1-8B-Instruct-FP4", - model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), - backend=AttentionBackendEnum.FLASHINFER, - matches=Matches( - attention_fusion=32, - allreduce_fusion=65, - sequence_parallel=65, - async_tp=128, - ), - ), - ] - - # TP only - MODELS = [ - ModelBackendTestCase( - model_name="meta-llama/Llama-3.1-8B-Instruct", - model_kwargs=dict(max_model_len=1024), - backend=AttentionBackendEnum.TRITON_ATTN, - matches=Matches( - attention_fusion=0, - allreduce_fusion=65, - sequence_parallel=65, - async_tp=128, - ), - ), - ModelBackendTestCase( - model_name="Qwen/Qwen3-30B-A3B", - model_kwargs=dict(max_model_len=1024), - backend=AttentionBackendEnum.TRITON_ATTN, - matches=Matches( - attention_fusion=0, - allreduce_fusion=97, - sequence_parallel=97, - async_tp=96, # MLP is MoE, half the fusions of dense - ), - ), - ] - -elif current_platform.is_rocm(): - MODELS_FP8 = [ - ModelBackendTestCase( - model_name="amd/Llama-3.1-8B-Instruct-FP8-KV", - model_kwargs=dict(max_model_len=1024), - backend=AttentionBackendEnum.TRITON_ATTN, - matches=Matches(attention_fusion=32), - ), - ModelBackendTestCase( - model_name="amd/Llama-3.1-8B-Instruct-FP8-KV", - model_kwargs=dict(max_model_len=1024), - backend=AttentionBackendEnum.ROCM_ATTN, - matches=Matches(attention_fusion=32), - ), - ModelBackendTestCase( - model_name="amd/Llama-3.1-8B-Instruct-FP8-KV", - model_kwargs=dict(max_model_len=1024), - backend=AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN, - matches=Matches(attention_fusion=32), - ), - ] - -CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"] - - -def has_cuda_graph_wrapper_metadata() -> bool: - from importlib import import_module - - try: - module = import_module("torch._inductor.utils") - module.CUDAGraphWrapperMetadata # noqa B018 - except AttributeError: - return False - return True - - -@pytest.mark.parametrize( - "model_name, model_kwargs, backend, matches, custom_ops", - # Test attention+quant_fp8 fusion with custom and torch impls of QuantFP8 - list(flat_product(MODELS_FP8, CUSTOM_OPS_FP8)) - # quant_fp4 only has the custom impl - + list(flat_product(MODELS_FP4, [""])), -) -@pytest.mark.parametrize( - "inductor_graph_partition", - [ - pytest.param( - True, - marks=pytest.mark.skipif( - not has_cuda_graph_wrapper_metadata(), - reason="This test requires" - "torch._inductor.utils.CUDAGraphWrapperMetadata to run", - ), - ), - False, - ], -) -def test_attn_quant( - model_name: str, - model_kwargs: dict[str, Any], - backend: AttentionBackendEnum, - matches: Matches, - custom_ops: str, - inductor_graph_partition: bool, - caplog_mp_spawn, - monkeypatch, -): - if backend == AttentionBackendEnum.FLASHINFER and ( - not is_blackwell() or not has_flashinfer() - ): - pytest.skip("FlashInfer attn fusion requires Blackwell and flashinfer") - if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): - pytest.skip("Inductor graph partition requires torch>=2.9") - - custom_ops_list = custom_ops.split(",") if custom_ops else [] - - if inductor_graph_partition: - mode = CUDAGraphMode.FULL_AND_PIECEWISE - splitting_ops: list[str] | None = None - else: - # FIXME: Llama-4-Scout-17B-16E-Instruct-FP8 + FlashInfer + Blackwell end at - # CUDAGraphMode.NONE here because it derives an attention backend that - # does not support full cudagraphs - mode = CUDAGraphMode.FULL_DECODE_ONLY - splitting_ops = [] - - # Disable, compile cache to make sure custom passes run. - # Otherwise, we can't verify fusion happened through the logs. - monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1") - - # To capture subprocess logs, we need to know whether spawn or fork is used. - # Force spawn as it is more general. - monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") - - model_kwargs["attention_config"] = {"backend": backend.name} - - compilation_config = CompilationConfig( - # Testing properties - custom_ops=custom_ops_list, - use_inductor_graph_partition=inductor_graph_partition, - cudagraph_mode=mode, - splitting_ops=splitting_ops, - # Common - mode=CompilationMode.VLLM_COMPILE, - pass_config=PassConfig(fuse_attn_quant=True, eliminate_noops=True), - # Inductor caches custom passes by default as well via uuid - inductor_compile_config={"force_disable_caches": True}, - ) - - with caplog_mp_spawn(logging.DEBUG) as log_holder: - run_model(compilation_config, model_name, **model_kwargs) - - log_matches = re.findall( - r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes", - log_holder.text, - ) - assert len(log_matches) == 1, log_holder.text - assert int(log_matches[0]) == matches.attention_fusion - - -CUSTOM_OPS_RMS_NORM = ["-rms_norm", "+rms_norm"] - - -def custom_ops_product(*custom_ops_lists: list[str]) -> Iterable[str]: - for op_list in itertools.product(*custom_ops_lists): - yield ",".join(op_list) - @multi_gpu_test(num_gpus=2) @pytest.mark.parametrize( @@ -421,7 +209,7 @@ def test_tp2_attn_quant_async_tp( custom_ops=custom_ops_list, splitting_ops=splitting_ops, # Common - level=CompilationMode.VLLM_COMPILE, + mode=CompilationMode.VLLM_COMPILE, pass_config=PassConfig( fuse_attn_quant=True, eliminate_noops=True, @@ -464,62 +252,6 @@ def test_tp2_attn_quant_async_tp( assert int(log_matches[1]) == matches.async_tp -def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs): - compilation_config = ( - compile_config - if isinstance(compile_config, CompilationConfig) - else CompilationConfig(mode=compile_config) - ) - - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - sampling_params = SamplingParams(temperature=0) - # Allow override from model_kwargs - model_kwargs = {"tensor_parallel_size": 1, **model_kwargs} - model_kwargs = {"disable_custom_all_reduce": True, **model_kwargs} - - # No cudagraphs by default - if compilation_config.cudagraph_mode is None: - compilation_config.cudagraph_mode = CUDAGraphMode.NONE - llm = LLM( - model=model, - compilation_config=compilation_config, - **model_kwargs, - ) - outputs = llm.generate(prompts, sampling_params) - - # Print the outputs. - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - - # Get the compile ranges split points after vllm config post init - # in order to compute compile ranges correctly - compilation_config.compile_ranges_split_points = ( - llm.llm_engine.vllm_config.compilation_config.compile_ranges_split_points - ) - - -if current_platform.is_cuda(): - MODELS_GROUP_FP8 = [ - ModelBackendTestCase( - model_name="Qwen/Qwen3-30B-A3B-FP8", - model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), - backend=AttentionBackendEnum.TRITON_ATTN, - matches=Matches( - rms_quant_norm_fusion=48, - ), - ), - ] - -CUSTOM_OPS_QUANT_RMS_NORM = ["+quant_fp8,+rms_norm"] - - @pytest.mark.parametrize( "model_name, model_kwargs, backend, matches, custom_ops", # Test rms norm+group quant_fp8 fusion diff --git a/tests/compile/fusion_test_utils.py b/tests/compile/fusion_test_utils.py new file mode 100644 index 000000000..ec7b987bf --- /dev/null +++ b/tests/compile/fusion_test_utils.py @@ -0,0 +1,208 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Shared utilities for fusion tests (e.g. test_fusion_attn.py).""" + +from __future__ import annotations + +import itertools +from collections.abc import Iterable +from typing import Any, NamedTuple + +from tests.v1.attention.utils import AttentionBackendEnum +from vllm import LLM, SamplingParams +from vllm.config import CompilationConfig, CUDAGraphMode +from vllm.platforms import current_platform + +is_blackwell = lambda: current_platform.is_device_capability_family(100) +"""Are we running on Blackwell, a lot of tests depend on it""" + + +def has_cuda_graph_wrapper_metadata() -> bool: + from importlib import import_module + + try: + module = import_module("torch._inductor.utils") + module.CUDAGraphWrapperMetadata # noqa B018 + except AttributeError: + return False + return True + + +class Matches(NamedTuple): + attention_fusion: int = 0 + allreduce_fusion: int = 0 + sequence_parallel: int = 0 + async_tp: int = 0 + rms_quant_norm_fusion: int = 0 + + +class ModelBackendTestCase(NamedTuple): + model_name: str + model_kwargs: dict[str, Any] + backend: AttentionBackendEnum + matches: Matches + + +# E2E model test cases +MODELS_FP8: list[ModelBackendTestCase] = [] +MODELS_FP4: list[ModelBackendTestCase] = [] +MODELS: list[ModelBackendTestCase] = [] # tp-only (unquantized) +MODELS_GROUP_FP8: list[ModelBackendTestCase] = [] + +if current_platform.is_cuda(): + MODELS_FP8 = [ + ModelBackendTestCase( + # Use smaller model for L40s in CI + model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", + model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), + backend=AttentionBackendEnum.TRITON_ATTN, + matches=Matches( + attention_fusion=32, + allreduce_fusion=65, + sequence_parallel=65, + async_tp=128, + ), + ), + ModelBackendTestCase( + model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", + model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), + # TODO FlashInfer attn broken on Hopper with kvcache=fp8: + # https://github.com/vllm-project/vllm/issues/28568 + backend=AttentionBackendEnum.FLASHINFER + if is_blackwell() + else AttentionBackendEnum.TRITON_ATTN, + matches=Matches( + attention_fusion=48, + allreduce_fusion=96, + sequence_parallel=96, + async_tp=95, # mlp is moe, no fusion there + ), + ), + ] + + MODELS_FP4 = [ + ModelBackendTestCase( + model_name="nvidia/Llama-3.1-8B-Instruct-FP4", + model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), + backend=AttentionBackendEnum.FLASHINFER, + matches=Matches( + attention_fusion=32, + allreduce_fusion=65, + sequence_parallel=65, + async_tp=128, + ), + ), + ] + + # TP only (unquantized models) + MODELS = [ + ModelBackendTestCase( + model_name="meta-llama/Llama-3.1-8B-Instruct", + model_kwargs=dict(max_model_len=1024), + backend=AttentionBackendEnum.TRITON_ATTN, + matches=Matches( + attention_fusion=0, + allreduce_fusion=65, + sequence_parallel=65, + async_tp=128, + ), + ), + ModelBackendTestCase( + model_name="Qwen/Qwen3-30B-A3B", + model_kwargs=dict(max_model_len=1024), + backend=AttentionBackendEnum.TRITON_ATTN, + matches=Matches( + attention_fusion=0, + allreduce_fusion=97, + sequence_parallel=97, + async_tp=96, # MLP is MoE, half the fusions of dense + ), + ), + ] + + MODELS_GROUP_FP8 = [ + ModelBackendTestCase( + model_name="Qwen/Qwen3-30B-A3B-FP8", + model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), + backend=AttentionBackendEnum.TRITON_ATTN, + matches=Matches( + rms_quant_norm_fusion=48, + ), + ), + ] + +elif current_platform.is_rocm(): + MODELS_FP8 = [ + ModelBackendTestCase( + model_name="amd/Llama-3.1-8B-Instruct-FP8-KV", + model_kwargs=dict(max_model_len=1024), + backend=AttentionBackendEnum.TRITON_ATTN, + matches=Matches(attention_fusion=32), + ), + ModelBackendTestCase( + model_name="amd/Llama-3.1-8B-Instruct-FP8-KV", + model_kwargs=dict(max_model_len=1024), + backend=AttentionBackendEnum.ROCM_ATTN, + matches=Matches(attention_fusion=32), + ), + ModelBackendTestCase( + model_name="amd/Llama-3.1-8B-Instruct-FP8-KV", + model_kwargs=dict(max_model_len=1024), + backend=AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN, + matches=Matches(attention_fusion=32), + ), + ] + + +# Custom ops toggle lists for parametrization +CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"] +CUSTOM_OPS_RMS_NORM = ["-rms_norm", "+rms_norm"] +CUSTOM_OPS_QUANT_RMS_NORM = ["+quant_fp8,+rms_norm"] + + +def custom_ops_product(*custom_ops_lists: list[str]) -> Iterable[str]: + """Generate all combinations of custom ops for parametrization.""" + for op_list in itertools.product(*custom_ops_lists): + yield ",".join(op_list) + + +def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs): + """Run a model with the given compilation config for E2E fusion tests.""" + compilation_config = ( + compile_config + if isinstance(compile_config, CompilationConfig) + else CompilationConfig(mode=compile_config) + ) + + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0) + # Allow override from model_kwargs + model_kwargs = {"tensor_parallel_size": 1, **model_kwargs} + model_kwargs = {"disable_custom_all_reduce": True, **model_kwargs} + + # No cudagraphs by default + if compilation_config.cudagraph_mode is None: + compilation_config.cudagraph_mode = CUDAGraphMode.NONE + llm = LLM( + model=model, + compilation_config=compilation_config, + **model_kwargs, + ) + outputs = llm.generate(prompts, sampling_params) + + # Print the outputs. + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + # Get the compile ranges split points after vllm config post init + # in order to compute compile ranges correctly + compilation_config.compile_ranges_split_points = ( + llm.llm_engine.vllm_config.compilation_config.compile_ranges_split_points + ) diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index a1fd098ae..ef6978a7f 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -1,12 +1,24 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy +import logging +from typing import Any import pytest +import regex as re import torch._dynamo from tests.compile.backend import LazyInitPass, TestBackend -from tests.utils import flat_product +from tests.compile.fusion_test_utils import ( + CUSTOM_OPS_FP8, + MODELS_FP4, + MODELS_FP8, + Matches, + has_cuda_graph_wrapper_metadata, + is_blackwell, + run_model, +) +from tests.utils import cuda_device_count_stateless, flat_product from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant from vllm.attention.layer import Attention @@ -20,6 +32,7 @@ from vllm.config import ( CacheConfig, CompilationConfig, CompilationMode, + CUDAGraphMode, ModelConfig, PassConfig, SchedulerConfig, @@ -35,6 +48,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer +from vllm.utils.torch_utils import is_torch_equal_or_newer from vllm.v1.attention.backend import AttentionMetadata from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.v1.kv_cache_interface import AttentionSpec @@ -241,8 +255,8 @@ class TestAttentionNvfp4QuantPatternModel(AttentionQuantPatternModel): ) -MODELS_FP8: list[tuple[str, type]] = [] -MODELS_FP4: list[tuple[str, type]] = [] +PATTERN_TEST_MODELS_FP8: list[tuple[str, type]] = [] +PATTERN_TEST_MODELS_FP4: list[tuple[str, type]] = [] HEADS: list[tuple[int, int]] = [] SPLIT_ATTENTION: list[bool] = [] BACKENDS_FP8: list[AttentionBackendEnum] = [] @@ -250,13 +264,13 @@ BACKENDS_FP4: list[AttentionBackendEnum] = [] if current_platform.is_cuda(): HEADS = [(64, 8), (40, 8)] - MODELS_FP8 = [ + PATTERN_TEST_MODELS_FP8 = [ ( "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", TestAttentionFp8StaticQuantPatternModel, ) ] - MODELS_FP4 = [ + PATTERN_TEST_MODELS_FP4 = [ ( "nvidia/Llama-4-Scout-17B-16E-Instruct-FP4", TestAttentionNvfp4QuantPatternModel, @@ -267,7 +281,7 @@ if current_platform.is_cuda(): elif current_platform.is_rocm(): HEADS = [(32, 8), (40, 8)] - MODELS_FP8 = [ + PATTERN_TEST_MODELS_FP8 = [ ("amd/Llama-3.1-8B-Instruct-FP8-KV", TestAttentionFp8StaticQuantPatternModel) ] BACKENDS = [ @@ -286,9 +300,13 @@ elif current_platform.is_rocm(): @pytest.mark.parametrize( "backend, model_name, model_class, custom_ops", # Test attention+quant_fp8 fusion with custom and torch impls of QuantFP8 - list(flat_product(BACKENDS_FP8, MODELS_FP8, ["+quant_fp8", "-quant_fp8"])) + list( + flat_product( + BACKENDS_FP8, PATTERN_TEST_MODELS_FP8, ["+quant_fp8", "-quant_fp8"] + ) + ) # quant_fp4 only has the custom impl - + list(flat_product(BACKENDS_FP4, MODELS_FP4, [""])), + + list(flat_product(BACKENDS_FP4, PATTERN_TEST_MODELS_FP4, [""])), ) @pytest.mark.skipif( not current_platform.is_cuda_alike(), reason="Only test ROCm or CUDA" @@ -315,6 +333,8 @@ def test_attention_quant_pattern( not current_platform.is_device_capability((10, 0)) or not has_flashinfer() ): pytest.skip("FlashInfer attn fusion requires Blackwell and flashinfer") + if "Llama-4-Scout" in model_name and cuda_device_count_stateless() < 2: + pytest.skip("Llama-4-Scout requires at least 2 GPUs") custom_ops_list = custom_ops.split(",") if custom_ops else [] @@ -483,3 +503,88 @@ def test_attention_quant_pattern( # Check that results are close torch.testing.assert_close(result_unfused, result_fused, atol=1e-2, rtol=1e-2) + + +@pytest.mark.parametrize( + "model_name, model_kwargs, backend, matches, custom_ops", + # Test attention+quant_fp8 fusion with custom and torch impls of QuantFP8 + list(flat_product(MODELS_FP8, CUSTOM_OPS_FP8)) + # quant_fp4 only has the custom impl + + list(flat_product(MODELS_FP4, [""])), +) +@pytest.mark.parametrize( + "inductor_graph_partition", + [ + pytest.param( + True, + marks=pytest.mark.skipif( + not has_cuda_graph_wrapper_metadata(), + reason="This test requires" + "torch._inductor.utils.CUDAGraphWrapperMetadata to run", + ), + ), + False, + ], +) +def test_attn_quant( + model_name: str, + model_kwargs: dict[str, Any], + backend: AttentionBackendEnum, + matches: Matches, + custom_ops: str, + inductor_graph_partition: bool, + caplog_mp_spawn, + monkeypatch, +): + if not current_platform.has_device_capability(90): + pytest.skip("test_attn_quant requires H100 (SM90) or B200 (SM100) GPU") + if backend == AttentionBackendEnum.FLASHINFER and ( + not is_blackwell() or not has_flashinfer() + ): + pytest.skip("FlashInfer attn fusion requires Blackwell and flashinfer") + if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): + pytest.skip("Inductor graph partition requires torch>=2.9") + + custom_ops_list = custom_ops.split(",") if custom_ops else [] + + if inductor_graph_partition: + mode = CUDAGraphMode.FULL_AND_PIECEWISE + splitting_ops: list[str] | None = None + else: + # FIXME: Llama-4-Scout-17B-16E-Instruct-FP8 + FlashInfer + Blackwell end at + # CUDAGraphMode.NONE here because it derives an attention backend that + # does not support full cudagraphs + mode = CUDAGraphMode.FULL_DECODE_ONLY + splitting_ops = [] + + # Disable, compile cache to make sure custom passes run. + # Otherwise, we can't verify fusion happened through the logs. + monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1") + + # To capture subprocess logs, we need to know whether spawn or fork is used. + # Force spawn as it is more general. + monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") + model_kwargs["attention_config"] = {"backend": backend.name} + + compilation_config = CompilationConfig( + # Testing properties + custom_ops=custom_ops_list, + use_inductor_graph_partition=inductor_graph_partition, + cudagraph_mode=mode, + splitting_ops=splitting_ops, + # Common + mode=CompilationMode.VLLM_COMPILE, + pass_config=PassConfig(fuse_attn_quant=True, eliminate_noops=True), + # Inductor caches custom passes by default as well via uuid + inductor_compile_config={"force_disable_caches": True}, + ) + + with caplog_mp_spawn(logging.DEBUG) as log_holder: + run_model(compilation_config, model_name, **model_kwargs) + + log_matches = re.findall( + r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes", + log_holder.text, + ) + assert len(log_matches) == 1, log_holder.text + assert int(log_matches[0]) == matches.attention_fusion diff --git a/vllm/env_override.py b/vllm/env_override.py index 474ac6991..8b2f2424d 100644 --- a/vllm/env_override.py +++ b/vllm/env_override.py @@ -95,7 +95,7 @@ def memory_plan_reuse_patched(self): # =================================================== # This change monkeypatches get_graph_partition_signature in pytorch 2.9.0 to # fix inductor partition + attention-nvfp4 quant fusion, tested in -# `tests/compile/distributed/test_fusions_e2e.py::test_attn_quant`. +# `tests/compile/test_fusion_attn.py::test_attn_quant`. # For more context, see https://github.com/pytorch/pytorch/pull/165815.