diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index ee7c6ab0a..0050c615a 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -604,9 +604,11 @@ steps: - tests/compile commands: - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' - # Limit to no custom ops to reduce running time - # Wrap with quotes to escape yaml and avoid starting -k string with a - - - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" + # # Limit to no custom ops to reduce running time + # # Wrap with quotes to escape yaml and avoid starting -k string with a - + # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" + # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 + # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. - label: Cudagraph test timeout_in_minutes: 20 @@ -1181,7 +1183,6 @@ steps: - tests/compile/test_fusion_attn.py - tests/compile/test_silu_mul_quant_fusion.py - tests/compile/distributed/test_fusion_all_reduce.py - - tests/compile/distributed/test_fusions_e2e.py - tests/compile/fullgraph/test_full_graph.py commands: - nvidia-smi @@ -1189,33 +1190,16 @@ steps: - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py # this runner has 2 GPUs available even though num_gpus=2 is not set - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time - # Wrap with quotes to escape yaml - - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" + + # # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time + # # Wrap with quotes to escape yaml + # - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" + # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 + # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. + # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile -- label: Blackwell Fusion E2E Tests # 30 min - timeout_in_minutes: 40 - working_dir: "/vllm-workspace/" - gpu: b200 - optional: true - num_gpus: 2 - source_file_dependencies: - - csrc/quantization/fp4/ - - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - - vllm/v1/attention/backends/flashinfer.py - - vllm/compilation/ - # can affect pattern matching - - vllm/model_executor/layers/layernorm.py - - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/distributed/test_fusions_e2e.py - commands: - - nvidia-smi - # Run all e2e fusion tests - - pytest -v -s tests/compile/distributed/test_fusions_e2e.py - - label: Blackwell GPT-OSS Eval timeout_in_minutes: 60 working_dir: "/vllm-workspace/" @@ -1566,7 +1550,10 @@ steps: - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm - - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" + # - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" + # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 + # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. + - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index bcd9997a4..554081f53 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -537,9 +537,11 @@ steps: commands: # fp8 kv scales not supported on sm89, tested on Blackwell instead - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' - # Limit to no custom ops to reduce running time - # Wrap with quotes to escape yaml and avoid starting -k string with a - - - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" + # # Limit to no custom ops to reduce running time + # # Wrap with quotes to escape yaml and avoid starting -k string with a - + # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" + # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 + # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. - label: Cudagraph test timeout_in_minutes: 20 @@ -1069,7 +1071,6 @@ steps: - tests/compile/test_fusion_attn.py - tests/compile/test_silu_mul_quant_fusion.py - tests/compile/distributed/test_fusion_all_reduce.py - - tests/compile/distributed/test_fusions_e2e.py - tests/compile/fullgraph/test_full_graph.py commands: - nvidia-smi @@ -1077,75 +1078,15 @@ steps: - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py # this runner has 2 GPUs available even though num_gpus=2 is not set - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time - # Wrap with quotes to escape yaml - - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" + # # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time + # # Wrap with quotes to escape yaml + # - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" + # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 + # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. + # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile -- label: Blackwell Fusion E2E Tests # 30 min - timeout_in_minutes: 40 - working_dir: "/vllm-workspace/" - gpu: b200 - optional: true - num_gpus: 2 - source_file_dependencies: - - csrc/quantization/fp4/ - - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - - vllm/v1/attention/backends/flashinfer.py - - vllm/compilation/ - # can affect pattern matching - - vllm/model_executor/layers/layernorm.py - - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/distributed/test_fusions_e2e.py - commands: - - nvidia-smi - # Run all e2e fusion tests - - pytest -v -s tests/compile/distributed/test_fusions_e2e.py - -- label: Hopper Fusion E2E Tests (H100) # 10min - timeout_in_minutes: 70 - working_dir: "/vllm-workspace/" - gpu: h100 - optional: true - source_file_dependencies: - - csrc/quantization/fp4/ - - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - - vllm/v1/attention/backends/flashinfer.py - - vllm/compilation/ - # can affect pattern matching - - vllm/model_executor/layers/layernorm.py - - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/test_fusion_attn.py - commands: - - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - # skip Llama-4 since it does not fit on this device - - pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4' - -- label: Hopper Fusion Distributed E2E Tests (2xH100) # 70min - timeout_in_minutes: 70 - working_dir: "/vllm-workspace/" - gpu: h100 - optional: true - num_gpus: 2 - source_file_dependencies: - - csrc/quantization/fp4/ - - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - - vllm/v1/attention/backends/flashinfer.py - - vllm/compilation/ - # can affect pattern matching - - vllm/model_executor/layers/layernorm.py - - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/distributed/test_fusions_e2e.py - commands: - - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - # Run all e2e fusion tests - - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4' - - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - - label: Blackwell GPT-OSS Eval timeout_in_minutes: 60 working_dir: "/vllm-workspace/" diff --git a/.buildkite/test_areas/compile.yaml b/.buildkite/test_areas/compile.yaml index 3c6f82fdd..e8cf9e8bd 100644 --- a/.buildkite/test_areas/compile.yaml +++ b/.buildkite/test_areas/compile.yaml @@ -2,56 +2,196 @@ group: Compile depends_on: - image-build steps: -- label: Fusion and Compile Tests (B200) +- label: Sequence Parallel Tests (2 GPUs) + timeout_in_minutes: 50 + working_dir: "/vllm-workspace/" + num_devices: 2 + source_file_dependencies: + - vllm/model_executor/layers/ + - vllm/compilation/ + - vllm/v1/worker/ + - vllm/v1/cudagraph_dispatcher.py + - tests/distributed/test_sequence_parallel.py + commands: + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - pytest -v -s tests/distributed/test_sequence_parallel.py + +- label: Sequence Parallel Tests (2xH100) + timeout_in_minutes: 50 + working_dir: "/vllm-workspace/" + device: h100 + optional: true + num_devices: 2 + commands: + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - pytest -v -s tests/distributed/test_sequence_parallel.py + +- label: Distributed Compile Unit Tests (2xH100) timeout_in_minutes: 40 working_dir: "/vllm-workspace/" + device: h100 + num_devices: 2 + source_file_dependencies: + - vllm/compilation/ + - vllm/model_executor/layers + - tests/compile/distributed/test_fusion_all_reduce.py + - tests/compile/distributed/test_sequence_parallelism.py + - tests/compile/distributed/test_async_tp.py + commands: + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py + - pytest -v -s tests/compile/distributed/test_async_tp.py + +- label: Fusion and Compile Unit Tests (B200) + timeout_in_minutes: 20 + working_dir: "/vllm-workspace/" device: b200 source_file_dependencies: - csrc/quantization/fp4/ - - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - - vllm/v1/attention/backends/flashinfer.py - - vllm/v1/worker/ - - vllm/v1/cudagraph_dispatcher.py - - vllm/compilation/ - # can affect pattern matching + - vllm/model_executor/layers/quantization/ - vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/quantization/input_quant_fp8.py + - vllm/model_executor/layers/attention/attention.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes - tests/compile/test_fusion_attn.py - tests/compile/test_silu_mul_quant_fusion.py - tests/compile/distributed/test_fusion_all_reduce.py - - tests/compile/distributed/test_fusions_e2e.py - tests/compile/fullgraph/test_full_graph.py commands: + # b200 runners are limited, so we limit the tests to the minimum set only supported on Blackwell - nvidia-smi - - pytest -v -s tests/compile/test_fusion_attn.py + - pytest -v -s tests/compile/test_fusion_attn.py -k FLASHINFER - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py # this runner has 2 GPUs available even though num_devices=2 is not set - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py - # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time - # Wrap with quotes to escape yaml - - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) + # TODO(luka) move to H100 once pass tests run on H100 - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile -- label: Fusion E2E (2 GPUs)(B200) - timeout_in_minutes: 40 +- label: Fusion E2E Quick (H100) + timeout_in_minutes: 15 working_dir: "/vllm-workspace/" - device: b200 - optional: true - num_devices: 2 + device: h100 + num_devices: 1 source_file_dependencies: - - csrc/quantization/fp4/ - - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - - vllm/v1/attention/backends/flashinfer.py - - vllm/compilation/ - # can affect pattern matching - - vllm/model_executor/layers/layernorm.py - - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/distributed/test_fusions_e2e.py + - csrc/quantization/ + - vllm/model_executor/ + - vllm/v1/attention/ + - vllm/compilation/ + - tests/compile/fusions_e2e/ commands: - nvidia-smi - # Run all e2e fusion tests - - pytest -v -s tests/compile/distributed/test_fusions_e2e.py + # Run all models and attn backends but only Inductor partition and native custom ops + - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" + # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported + - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" +- label: Fusion E2E Config Sweep (H100) + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/" + device: h100 + num_devices: 1 + source_file_dependencies: + - csrc/quantization/ + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/attention/attention.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/fusions_e2e/ + commands: + - nvidia-smi + # Run just llama3 (fp8) for all config combinations + - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3" + +- label: Fusion E2E Config Sweep (B200) + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/" + device: b200 + num_devices: 1 + optional: true + commands: + - nvidia-smi + # Run all models and attn backends but only Inductor partition and native custom ops + # -k "inductor_partition and not +rms_norm and not +quant_fp8" + # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported + # -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" + # Run just llama3 (fp8 & fp4) for all config combinations + # -k "llama-3" + - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3" + +- label: Fusion E2E TP2 Quick (H100) + timeout_in_minutes: 20 + working_dir: "/vllm-workspace/" + device: h100 + num_devices: 2 + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/ + - vllm/v1/attention/ + - vllm/compilation/ + - tests/compile/fusions_e2e/ + commands: + - nvidia-smi + # Run all models and attn backends but only Inductor partition and native custom ops + - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8" + - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8" + +- label: Fusion E2E TP2 AR-RMS Config Sweep (H100) + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + device: h100 + num_devices: 2 + source_file_dependencies: + - csrc/quantization/ + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/attention/attention.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/fusions_e2e/ + commands: + - nvidia-smi + # Run just llama3 (fp4 & fp8 & bf16) for all config combinations + - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3" + +- label: Fusion E2E TP2 AsyncTP Config Sweep (H100) + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + device: h100 + num_devices: 2 + source_file_dependencies: + - csrc/quantization/ + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/attention/attention.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/fusions_e2e/ + commands: + - nvidia-smi + # Run just llama3 (fp8 & bf16) for all config combinations + - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3" + +- label: Fusion E2E TP2 (B200) + timeout_in_minutes: 20 + working_dir: "/vllm-workspace/" + device: b200 + num_devices: 2 + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/ + - vllm/v1/attention/ + - vllm/compilation/ + - tests/compile/fusions_e2e/ + commands: + - nvidia-smi + # Run all models and attn backends but only Inductor partition and native custom ops + # for ar-rms-quant-fp4, also sweep llama3 + - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "Llama-3.1-8B-Instruct-FP4" + - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8" diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index 51e1de3f0..ae4f45fbf 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -16,7 +16,7 @@ steps: - pytest -v -s distributed/test_shm_storage.py - label: Distributed (2 GPUs) - timeout_in_minutes: 90 + timeout_in_minutes: 60 working_dir: "/vllm-workspace/tests" num_devices: 2 source_file_dependencies: @@ -47,7 +47,6 @@ steps: - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - - pytest -v -s distributed/test_sequence_parallel.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s v1/worker/test_worker_memory_snapshot.py @@ -133,25 +132,13 @@ steps: - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - pytest -v -s -x lora/test_mixtral.py -- label: Sequence Parallel Tests (H100) - timeout_in_minutes: 60 - working_dir: "/vllm-workspace/" - device: h100 - optional: true - num_devices: 2 - commands: - - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - # Run sequence parallel tests - - pytest -v -s tests/distributed/test_sequence_parallel.py - - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py - - label: Distributed Tests (2 GPUs)(H100) + timeout_in_minutes: 15 device: h100 optional: true working_dir: "/vllm-workspace/" num_devices: 2 commands: - - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py - pytest -v -s tests/distributed/test_context_parallel.py - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py @@ -217,45 +204,3 @@ steps: commands: - pytest -v -s distributed/test_pp_cudagraph.py - pytest -v -s distributed/test_pipeline_parallel.py - -- label: Hopper Fusion E2E Tests (H100) - timeout_in_minutes: 70 - working_dir: "/vllm-workspace/" - device: h100 - optional: true - source_file_dependencies: - - csrc/quantization/fp4/ - - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - - vllm/v1/attention/backends/flashinfer.py - - vllm/compilation/ - # can affect pattern matching - - vllm/model_executor/layers/layernorm.py - - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/test_fusion_attn.py - commands: - - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - # skip Llama-4 since it does not fit on this device - - pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4' - -- label: Hopper Fusion Distributed E2E Tests (2xH100) - timeout_in_minutes: 70 - working_dir: "/vllm-workspace/" - device: h100 - optional: true - num_devices: 2 - source_file_dependencies: - - csrc/quantization/fp4/ - - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - - vllm/v1/attention/backends/flashinfer.py - - vllm/compilation/ - # can affect pattern matching - - vllm/model_executor/layers/layernorm.py - - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/distributed/test_fusions_e2e.py - commands: - - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - # Run all e2e fusion tests - - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4' - - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py diff --git a/.buildkite/test_areas/pytorch.yaml b/.buildkite/test_areas/pytorch.yaml index 332d5202d..1ac3eec58 100644 --- a/.buildkite/test_areas/pytorch.yaml +++ b/.buildkite/test_areas/pytorch.yaml @@ -18,7 +18,7 @@ steps: - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'" - label: PyTorch Fullgraph Smoke Test - timeout_in_minutes: 30 + timeout_in_minutes: 35 source_file_dependencies: - vllm/ - tests/compile @@ -30,16 +30,13 @@ steps: - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;" - label: PyTorch Fullgraph - timeout_in_minutes: 40 + timeout_in_minutes: 30 source_file_dependencies: - vllm/ - tests/compile commands: # fp8 kv scales not supported on sm89, tested on Blackwell instead - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' - # Limit to no custom ops to reduce running time - # Wrap with quotes to escape yaml and avoid starting -k string with a - - - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" - label: Pytorch Nightly Dependency Override Check # 2min # if this test fails, it means the nightly torch version is not compatible with some diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py deleted file mode 100644 index b9913734d..000000000 --- a/tests/compile/distributed/test_fusions_e2e.py +++ /dev/null @@ -1,321 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from __future__ import annotations - -import logging -from typing import Any - -import pytest -import regex as re - -from tests.compile.fusion_test_utils import ( - CUSTOM_OPS_FP8, - CUSTOM_OPS_QUANT_RMS_NORM, - CUSTOM_OPS_RMS_NORM, - MODELS, - MODELS_FP4, - MODELS_FP8, - MODELS_GROUP_FP8, - Matches, - custom_ops_product, - is_blackwell, - run_model, -) -from tests.v1.attention.utils import AttentionBackendEnum -from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig -from vllm.platforms import current_platform -from vllm.utils.flashinfer import has_flashinfer -from vllm.utils.torch_utils import is_torch_equal_or_newer - -from ...utils import flat_product, multi_gpu_test - - -@multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize( - "model_name, model_kwargs, backend, matches, custom_ops", - # Toggle RMSNorm and QuantFP8 for FP8 models - list( - flat_product( - MODELS_FP8, custom_ops_product(CUSTOM_OPS_FP8, CUSTOM_OPS_RMS_NORM) - ) - ) - # Toggle RMSNorm for FP4 models and unquant models - + list(flat_product(MODELS_FP4 + MODELS, CUSTOM_OPS_RMS_NORM)), -) -@pytest.mark.parametrize("inductor_graph_partition", [True, False]) -@pytest.mark.skipif( - not current_platform.is_cuda() - or not has_flashinfer() - or not current_platform.has_device_capability(90), - reason="allreduce+rmsnorm fusion requires flashinfer", -) -def test_tp2_attn_quant_allreduce_rmsnorm( - model_name: str, - model_kwargs: dict, - backend: AttentionBackendEnum, - matches: Matches, - custom_ops: str, - inductor_graph_partition: bool, - caplog_mp_spawn, - monkeypatch, -): - if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): - pytest.skip("Inductor graph partition requires torch>=2.9") - - if "fp4" in model_name.lower() and not is_blackwell(): - pytest.skip("NVFP4 quant requires Blackwell") - - if backend == AttentionBackendEnum.FLASHINFER and not is_blackwell(): - # FlashInfer attn fusion requires Blackwell - matches = matches._replace(attention_fusion=0) - - custom_ops_list = custom_ops.split(",") if custom_ops else [] - - if inductor_graph_partition: - mode = CUDAGraphMode.FULL_AND_PIECEWISE - splitting_ops: list[str] | None = None - else: - mode = CUDAGraphMode.FULL_DECODE_ONLY - splitting_ops = [] - - # Disable, compile cache to make sure custom passes run. - # Otherwise, we can't verify fusion happened through the logs. - monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1") - - # To capture subprocess logs, we need to know whether spawn or fork is used. - # Force spawn as it is more general. - monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") - - model_kwargs["attention_config"] = {"backend": backend.name} - - compilation_config = CompilationConfig( - # Testing properties - use_inductor_graph_partition=inductor_graph_partition, - cudagraph_mode=mode, - custom_ops=custom_ops_list, - splitting_ops=splitting_ops, - # Common - mode=CompilationMode.VLLM_COMPILE, - pass_config=PassConfig( - fuse_attn_quant=True, - eliminate_noops=True, - fuse_allreduce_rms=True, - ), - # Inductor caches custom passes by default as well via uuid - inductor_compile_config={"force_disable_caches": True}, - ) - - with caplog_mp_spawn(logging.DEBUG) as log_holder: - run_model( - compilation_config, model_name, tensor_parallel_size=2, **model_kwargs - ) - log_matches = re.findall( - r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes", - log_holder.text, - ) - # 2 for each compile range - # (global compile range can be split due to fuse_allreduce_rmsnorm) - num_compile_ranges = len(compilation_config.get_compile_ranges()) - assert num_compile_ranges in [1, 2] - - assert len(log_matches) == 2 * num_compile_ranges, log_holder.text - - assert all(int(log_match) == matches.attention_fusion for log_match in log_matches) - - log_matches = re.findall( - r"collective_fusion.py:\d+] Replaced (\d+) patterns", - log_holder.text, - ) - assert len(log_matches) == 2, log_holder.text - - assert int(log_matches[0]) == matches.allreduce_fusion - assert int(log_matches[1]) == matches.allreduce_fusion - - log_matches = re.findall( - r"pass_manager.py:\d+] Skipping .*AllReduceFusionPass.* with compile range", - log_holder.text, - ) - assert len(log_matches) == 2 * (num_compile_ranges - 1), log_holder.text - - -@multi_gpu_test(num_gpus=2) -@pytest.mark.parametrize( - "model_name, model_kwargs, backend, matches, custom_ops", - # Toggle RMSNorm and QuantFP8 for FP8 models - list( - flat_product( - MODELS_FP8, custom_ops_product(CUSTOM_OPS_FP8, CUSTOM_OPS_RMS_NORM) - ) - ) - # Toggle RMSNorm for FP4 models and unquant models - + list(flat_product(MODELS_FP4 + MODELS, CUSTOM_OPS_RMS_NORM)), -) -@pytest.mark.parametrize("inductor_graph_partition", [True, False]) -@pytest.mark.skipif( - not current_platform.is_cuda(), - reason="sequence parallel only tested on CUDA", -) -def test_tp2_attn_quant_async_tp( - model_name: str, - model_kwargs: dict, - backend: AttentionBackendEnum, - matches: Matches, - custom_ops: str, - inductor_graph_partition: bool, - caplog_mp_spawn, - monkeypatch, -): - if is_blackwell(): - # TODO: https://github.com/vllm-project/vllm/issues/27893 - pytest.skip("Blackwell is not supported for AsyncTP pass") - - if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): - pytest.skip("Inductor graph partition requires torch>=2.9") - - if "fp4" in model_name.lower() and not is_blackwell(): - pytest.skip("NVFP4 quant requires Blackwell") - - if backend == AttentionBackendEnum.FLASHINFER: - if not has_flashinfer(): - pytest.skip("FlashInfer backend requires flashinfer installed") - if not is_blackwell(): - # FlashInfer attn fusion requires Blackwell - matches = matches._replace(attention_fusion=0) - - custom_ops_list = custom_ops.split(",") if custom_ops else [] - - if inductor_graph_partition: - mode = CUDAGraphMode.FULL_AND_PIECEWISE - splitting_ops: list[str] | None = None - else: - mode = CUDAGraphMode.FULL_DECODE_ONLY - splitting_ops = [] - - # Disable, compile cache to make sure custom passes run. - # Otherwise, we can't verify fusion happened through the logs. - monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1") - - # To capture subprocess logs, we need to know whether spawn or fork is used. - # Force spawn as it is more general. - monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") - - model_kwargs["attention_config"] = {"backend": backend.name} - - compilation_config = CompilationConfig( - # Testing properties - use_inductor_graph_partition=inductor_graph_partition, - cudagraph_mode=mode, - custom_ops=custom_ops_list, - splitting_ops=splitting_ops, - # Common - mode=CompilationMode.VLLM_COMPILE, - pass_config=PassConfig( - fuse_attn_quant=True, - eliminate_noops=True, - enable_sp=True, - fuse_gemm_comms=True, - ), - # Inductor caches custom passes by default as well via uuid - inductor_compile_config={"force_disable_caches": True}, - ) - - with caplog_mp_spawn(logging.DEBUG) as log_holder: - run_model( - compilation_config, model_name, tensor_parallel_size=2, **model_kwargs - ) - log_matches = re.findall( - r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes", - log_holder.text, - ) - assert len(log_matches) == 2, log_holder.text - - assert int(log_matches[0]) == matches.attention_fusion - assert int(log_matches[1]) == matches.attention_fusion - - log_matches = re.findall( - r"sequence_parallelism.py:\d+] Replaced (\d+) patterns", - log_holder.text, - ) - assert len(log_matches) == 2, log_holder.text - - assert int(log_matches[0]) == matches.sequence_parallel - assert int(log_matches[1]) == matches.sequence_parallel - - log_matches = re.findall( - r"collective_fusion.py:\d+] Replaced (\d+) patterns", - log_holder.text, - ) - assert len(log_matches) == 2, log_holder.text - - assert int(log_matches[0]) == matches.async_tp - assert int(log_matches[1]) == matches.async_tp - - -@pytest.mark.parametrize( - "model_name, model_kwargs, backend, matches, custom_ops", - # Test rms norm+group quant_fp8 fusion - list[tuple[Any, ...]](flat_product(MODELS_GROUP_FP8, CUSTOM_OPS_QUANT_RMS_NORM)), -) -@pytest.mark.parametrize("inductor_graph_partition", [True, False]) -# TODO: remove skip after we fix the fusion thoroughly -@pytest.mark.skipif(is_blackwell(), reason="Temporarily disabled on Blackwell") -def test_rms_group_quant( - model_name: str, - model_kwargs: dict[str, Any], - backend: AttentionBackendEnum, - matches: Matches, - custom_ops: str, - inductor_graph_partition: bool, - caplog_mp_spawn, - monkeypatch, -): - if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): - pytest.skip("Inductor graph partition requires torch>=2.9") - - custom_ops_list = custom_ops.split(",") if custom_ops else [] - - if inductor_graph_partition: - mode = CUDAGraphMode.FULL_AND_PIECEWISE - splitting_ops: list[str] | None = None - else: - mode = CUDAGraphMode.FULL_DECODE_ONLY - splitting_ops = [] - - # Disable, compile cache to make sure custom passes run. - # Otherwise, we can't verify fusion happened through the logs. - monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1") - - # To capture subprocess logs, we need to know whether spawn or fork is used. - # Force spawn as it is more general. - monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") - - # TODO: remove this after fusion is fixed - monkeypatch.setenv("VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES", "0") - - model_kwargs["attention_config"] = {"backend": backend.name} - - compilation_config = CompilationConfig( - # Testing properties - custom_ops=custom_ops_list, - use_inductor_graph_partition=inductor_graph_partition, - cudagraph_mode=mode, - splitting_ops=splitting_ops, - # Common - mode=CompilationMode.VLLM_COMPILE, - pass_config=PassConfig( - fuse_norm_quant=True, fuse_act_quant=True, eliminate_noops=True - ), - # Inductor caches custom passes by default as well via uuid - inductor_compile_config={"force_disable_caches": True}, - ) - - with caplog_mp_spawn(logging.DEBUG) as log_holder: - run_model(compilation_config, model_name, **model_kwargs) - - log_matches = re.findall( - r"\[fusion.py:\d+] Replaced (\d+) patterns", - log_holder.text, - ) - assert len(log_matches) == 1, log_holder.text - assert int(log_matches[0]) == matches.rms_quant_norm_fusion diff --git a/tests/compile/fusion_test_utils.py b/tests/compile/fusion_test_utils.py deleted file mode 100644 index ec7b987bf..000000000 --- a/tests/compile/fusion_test_utils.py +++ /dev/null @@ -1,208 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Shared utilities for fusion tests (e.g. test_fusion_attn.py).""" - -from __future__ import annotations - -import itertools -from collections.abc import Iterable -from typing import Any, NamedTuple - -from tests.v1.attention.utils import AttentionBackendEnum -from vllm import LLM, SamplingParams -from vllm.config import CompilationConfig, CUDAGraphMode -from vllm.platforms import current_platform - -is_blackwell = lambda: current_platform.is_device_capability_family(100) -"""Are we running on Blackwell, a lot of tests depend on it""" - - -def has_cuda_graph_wrapper_metadata() -> bool: - from importlib import import_module - - try: - module = import_module("torch._inductor.utils") - module.CUDAGraphWrapperMetadata # noqa B018 - except AttributeError: - return False - return True - - -class Matches(NamedTuple): - attention_fusion: int = 0 - allreduce_fusion: int = 0 - sequence_parallel: int = 0 - async_tp: int = 0 - rms_quant_norm_fusion: int = 0 - - -class ModelBackendTestCase(NamedTuple): - model_name: str - model_kwargs: dict[str, Any] - backend: AttentionBackendEnum - matches: Matches - - -# E2E model test cases -MODELS_FP8: list[ModelBackendTestCase] = [] -MODELS_FP4: list[ModelBackendTestCase] = [] -MODELS: list[ModelBackendTestCase] = [] # tp-only (unquantized) -MODELS_GROUP_FP8: list[ModelBackendTestCase] = [] - -if current_platform.is_cuda(): - MODELS_FP8 = [ - ModelBackendTestCase( - # Use smaller model for L40s in CI - model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", - model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), - backend=AttentionBackendEnum.TRITON_ATTN, - matches=Matches( - attention_fusion=32, - allreduce_fusion=65, - sequence_parallel=65, - async_tp=128, - ), - ), - ModelBackendTestCase( - model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", - model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), - # TODO FlashInfer attn broken on Hopper with kvcache=fp8: - # https://github.com/vllm-project/vllm/issues/28568 - backend=AttentionBackendEnum.FLASHINFER - if is_blackwell() - else AttentionBackendEnum.TRITON_ATTN, - matches=Matches( - attention_fusion=48, - allreduce_fusion=96, - sequence_parallel=96, - async_tp=95, # mlp is moe, no fusion there - ), - ), - ] - - MODELS_FP4 = [ - ModelBackendTestCase( - model_name="nvidia/Llama-3.1-8B-Instruct-FP4", - model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), - backend=AttentionBackendEnum.FLASHINFER, - matches=Matches( - attention_fusion=32, - allreduce_fusion=65, - sequence_parallel=65, - async_tp=128, - ), - ), - ] - - # TP only (unquantized models) - MODELS = [ - ModelBackendTestCase( - model_name="meta-llama/Llama-3.1-8B-Instruct", - model_kwargs=dict(max_model_len=1024), - backend=AttentionBackendEnum.TRITON_ATTN, - matches=Matches( - attention_fusion=0, - allreduce_fusion=65, - sequence_parallel=65, - async_tp=128, - ), - ), - ModelBackendTestCase( - model_name="Qwen/Qwen3-30B-A3B", - model_kwargs=dict(max_model_len=1024), - backend=AttentionBackendEnum.TRITON_ATTN, - matches=Matches( - attention_fusion=0, - allreduce_fusion=97, - sequence_parallel=97, - async_tp=96, # MLP is MoE, half the fusions of dense - ), - ), - ] - - MODELS_GROUP_FP8 = [ - ModelBackendTestCase( - model_name="Qwen/Qwen3-30B-A3B-FP8", - model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"), - backend=AttentionBackendEnum.TRITON_ATTN, - matches=Matches( - rms_quant_norm_fusion=48, - ), - ), - ] - -elif current_platform.is_rocm(): - MODELS_FP8 = [ - ModelBackendTestCase( - model_name="amd/Llama-3.1-8B-Instruct-FP8-KV", - model_kwargs=dict(max_model_len=1024), - backend=AttentionBackendEnum.TRITON_ATTN, - matches=Matches(attention_fusion=32), - ), - ModelBackendTestCase( - model_name="amd/Llama-3.1-8B-Instruct-FP8-KV", - model_kwargs=dict(max_model_len=1024), - backend=AttentionBackendEnum.ROCM_ATTN, - matches=Matches(attention_fusion=32), - ), - ModelBackendTestCase( - model_name="amd/Llama-3.1-8B-Instruct-FP8-KV", - model_kwargs=dict(max_model_len=1024), - backend=AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN, - matches=Matches(attention_fusion=32), - ), - ] - - -# Custom ops toggle lists for parametrization -CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"] -CUSTOM_OPS_RMS_NORM = ["-rms_norm", "+rms_norm"] -CUSTOM_OPS_QUANT_RMS_NORM = ["+quant_fp8,+rms_norm"] - - -def custom_ops_product(*custom_ops_lists: list[str]) -> Iterable[str]: - """Generate all combinations of custom ops for parametrization.""" - for op_list in itertools.product(*custom_ops_lists): - yield ",".join(op_list) - - -def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs): - """Run a model with the given compilation config for E2E fusion tests.""" - compilation_config = ( - compile_config - if isinstance(compile_config, CompilationConfig) - else CompilationConfig(mode=compile_config) - ) - - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - sampling_params = SamplingParams(temperature=0) - # Allow override from model_kwargs - model_kwargs = {"tensor_parallel_size": 1, **model_kwargs} - model_kwargs = {"disable_custom_all_reduce": True, **model_kwargs} - - # No cudagraphs by default - if compilation_config.cudagraph_mode is None: - compilation_config.cudagraph_mode = CUDAGraphMode.NONE - llm = LLM( - model=model, - compilation_config=compilation_config, - **model_kwargs, - ) - outputs = llm.generate(prompts, sampling_params) - - # Print the outputs. - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - - # Get the compile ranges split points after vllm config post init - # in order to compute compile ranges correctly - compilation_config.compile_ranges_split_points = ( - llm.llm_engine.vllm_config.compilation_config.compile_ranges_split_points - ) diff --git a/tests/compile/fusions_e2e/__init__.py b/tests/compile/fusions_e2e/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/compile/fusions_e2e/common.py b/tests/compile/fusions_e2e/common.py new file mode 100644 index 000000000..d950bf5b6 --- /dev/null +++ b/tests/compile/fusions_e2e/common.py @@ -0,0 +1,102 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import itertools +from collections.abc import Callable, Iterable +from typing import Any, NamedTuple + +import pytest +import regex as re + +from vllm.platforms import current_platform +from vllm.v1.attention.backends.registry import AttentionBackendEnum + + +class Matches(NamedTuple): + # simple pointwise + rms_quant_fusion: int = 0 + act_quant_fusion: int = 0 + norm_rope_fusion: int = 0 + attn_quant_fusion: int = 0 + # distributed + ar_rms_fusion: int = 0 + sequence_parallel: int = 0 + async_tp: int = 0 + + +class ModelFusionInfo(NamedTuple): + model_name: str + matches: Callable[[int], Matches] + """Given number of hidden layers, produces the matches object""" + model_kwargs: dict[str, Any] = {} + hf_overrides: Callable[[int], dict] = lambda n: {"num_hidden_layers": n} + + +class AttentionBackendCase(NamedTuple): + backend: AttentionBackendEnum + model_kwargs: dict[str, Any] = {} + """Additional args required for attn+quant fusion""" + + +is_blackwell = lambda: current_platform.is_device_capability_family(100) +"""Are we running on Blackwell, a lot of tests depend on it""" + + +def custom_ops_combos(*custom_ops: str) -> Iterable[str]: + """Generate all combinations of custom ops for parametrization.""" + custom_ops_lists = [[f"-{op}", f"+{op}"] for op in custom_ops] + for op_list in itertools.product(*custom_ops_lists): + yield ",".join(op_list) + + +# Quick inline validation +assert list(custom_ops_combos("silu_and_mul")) == ["-silu_and_mul", "+silu_and_mul"] +assert list(custom_ops_combos("quant_fp8", "rms_norm")) == [ + "-quant_fp8,-rms_norm", + "-quant_fp8,+rms_norm", + "+quant_fp8,-rms_norm", + "+quant_fp8,+rms_norm", +] + + +def has_cuda_graph_wrapper_metadata() -> bool: + from importlib import import_module + + try: + module = import_module("torch._inductor.utils") + module.CUDAGraphWrapperMetadata # noqa B018 + except AttributeError: + return False + return True + + +INDUCTOR_GRAPH_PARTITION = [ + pytest.param( + True, + marks=pytest.mark.skipif( + not has_cuda_graph_wrapper_metadata(), + reason="torch version does not support Inductor partition", + ), + id="inductor_partition", + ), + pytest.param(False, id="dynamo_partition"), +] + +FUSION_LOG_PATTERNS: dict[str, re.Pattern] = { + "rms_quant_fusion": re.compile( + r"\[(?:compilation/)?fusion.py:\d+] Replaced (\d+) patterns" + ), + "act_quant_fusion": re.compile( + r"activation_quant_fusion.py:\d+] Replaced (\d+) patterns" + ), + "norm_rope_fusion": re.compile( + r"qk_norm_rope_fusion.py:\d+] Fused QK Norm\+RoPE on (\d+) sites" + ), + "attn_quant_fusion": re.compile( + r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes" + ), + "ar_rms_fusion": re.compile(r"collective_fusion.py:\d+] Replaced (\d+) patterns"), + "sequence_parallel": re.compile( + r"sequence_parallelism.py:\d+] Replaced (\d+) patterns" + ), + "async_tp": re.compile(r"collective_fusion.py:\d+] Replaced (\d+) patterns"), +} diff --git a/tests/compile/fusions_e2e/conftest.py b/tests/compile/fusions_e2e/conftest.py new file mode 100644 index 000000000..1d9f6cda9 --- /dev/null +++ b/tests/compile/fusions_e2e/conftest.py @@ -0,0 +1,158 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import logging + +import pytest +import regex as re + +from vllm import LLM, SamplingParams +from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode + +from .common import FUSION_LOG_PATTERNS, AttentionBackendCase, Matches + + +def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs): + """Run a model with the given compilation config for E2E fusion tests.""" + compilation_config = ( + compile_config + if isinstance(compile_config, CompilationConfig) + else CompilationConfig(mode=compile_config) + ) + + prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0) + # Allow override from model_kwargs + model_kwargs = {"tensor_parallel_size": 1, **model_kwargs} + model_kwargs = {"disable_custom_all_reduce": True, **model_kwargs} + + # No cudagraphs by default + if compilation_config.cudagraph_mode is None: + compilation_config.cudagraph_mode = CUDAGraphMode.NONE + llm = LLM( + model=model, + compilation_config=compilation_config, + **model_kwargs, + ) + outputs = llm.generate(prompts, sampling_params) + + # Print the outputs. + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + + # Get the compile ranges split points after vllm config post init + # in order to compute compile ranges correctly + compilation_config.compile_ranges_split_points = ( + llm.llm_engine.vllm_config.compilation_config.compile_ranges_split_points + ) + + +@pytest.fixture +def run_e2e_fusion_test(monkeypatch, caplog_mp_spawn): + def run( + model_name: str, + matches: Matches, + model_kwargs: dict, + attn_backend: AttentionBackendCase, + compilation_config: dict, + matches_check: list[str], + use_deepgemm: bool = False, + tp_size: int = 1, + ): + monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1" if use_deepgemm else "0") + + # Disable, compile cache to make sure custom passes run. + # Otherwise, we can't verify fusion happened through the logs. + monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1") + + # To capture subprocess logs, we need to know whether spawn or fork is used. + # Force spawn as it is more general. + monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") + + model_kwargs = {**attn_backend.model_kwargs, **model_kwargs} + model_kwargs["attention_config"] = {"backend": attn_backend.backend.name} + model_kwargs["tensor_parallel_size"] = tp_size + + # Always compile the full graph instead of piecewise + if not compilation_config["use_inductor_graph_partition"]: + compilation_config["splitting_ops"] = [] + + full_compilation_config = CompilationConfig( + cudagraph_mode=CUDAGraphMode.NONE, + mode=CompilationMode.VLLM_COMPILE, + inductor_compile_config={"force_disable_caches": True}, + **compilation_config, + ) + + with caplog_mp_spawn(logging.DEBUG) as log_holder: + run_model(full_compilation_config, model_name, **model_kwargs) + + num_compile_ranges = len(full_compilation_config.get_compile_ranges()) + assert num_compile_ranges in [1, 2] + + print(f"Compile ranges: {full_compilation_config.get_compile_ranges()}") + print("Fusion results:") + + # Iterate through all so printing happens before asserting + log_matches_dict = {} + for match_name, pattern in FUSION_LOG_PATTERNS.items(): + log_matches_dict[match_name] = list(pattern.findall(log_holder.text)) + print(f"- {match_name}={','.join(log_matches_dict[match_name])}") + + # Now check the matches + for match_name in matches_check: + num_ranges_activated = ( + 1 if match_name == "ar_rms_fusion" else num_compile_ranges + ) + n_expected = tp_size * num_ranges_activated + + log_matches = list(int(ms) for ms in log_matches_dict[match_name]) + assert len(log_matches) == n_expected, ( + f"Could not find {n_expected} {match_name} " + f"(found {len(log_matches)}) in:\n {log_holder.text}" + ) + + expected_matches = getattr(matches, match_name) + + if match_name == "rms_quant_fusion" and "ar_rms_fusion" in matches_check: + # AR+rms+quant takes precedence over rms+quant if activated. + # That means we get full matching where ar+rms+quant was not activated, + # and less where it was + assert sum(m == expected_matches for m in log_matches) == tp_size * ( + num_ranges_activated - 1 + ), "Expecting full rms+quant fusion where ar+rms+quant not activated" + + assert all( + expected_matches - matches.ar_rms_fusion <= m <= expected_matches + for m in log_matches + ), ( + f"Expecting at least {expected_matches - matches.ar_rms_fusion} " + f"where ar+rms+quant was activated" + ) + else: + expected_matches_list = [expected_matches] * n_expected + assert sorted(log_matches) == expected_matches_list, ( + f"{match_name} expected: {expected_matches_list}, " + f"found: {sorted(log_matches)}" + ) + + if match_name == "ar_rms_fusion": + log_matches = re.findall( + r"pass_manager.py:\d+] Skipping " + r".*AllReduceFusionPass.* with compile range", + log_holder.text, + ) + + n_expected = tp_size * (num_compile_ranges - num_ranges_activated) + assert len(log_matches) == n_expected, ( + f'Could not find {n_expected} "Skipping AllReduceFusionPass" ' + f"(found {len(log_matches)}) in:\n {log_holder.text}" + ) + + return run diff --git a/tests/compile/fusions_e2e/models.py b/tests/compile/fusions_e2e/models.py new file mode 100644 index 000000000..ef9b6be25 --- /dev/null +++ b/tests/compile/fusions_e2e/models.py @@ -0,0 +1,112 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import pytest + +from vllm.utils.flashinfer import has_flashinfer +from vllm.v1.attention.backends.registry import AttentionBackendEnum + +from .common import AttentionBackendCase, Matches, ModelFusionInfo, is_blackwell + +# Attn backends +FLASHINFER_ATTN = pytest.param( + AttentionBackendCase( + backend=AttentionBackendEnum.FLASHINFER, + model_kwargs=dict(kv_cache_dtype="fp8"), + ), + id="FLASHINFER", + marks=pytest.mark.skipif( + not is_blackwell() or not has_flashinfer(), + reason="FI backend requires Blackwell and FlashInfer", + ), +) + +TRITON_ATTN = pytest.param( + AttentionBackendCase(backend=AttentionBackendEnum.TRITON_ATTN), id="TRITON_ATTN" +) + +# Models +llama3_8b = ModelFusionInfo( + model_name="meta-llama/Llama-3.1-8B-Instruct", + matches=lambda n_layers: Matches( + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 4, + ), +) + +llama3_8b_fp8 = ModelFusionInfo( + model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", + matches=lambda n_layers: Matches( + rms_quant_fusion=n_layers * 2, + act_quant_fusion=n_layers, + attn_quant_fusion=n_layers, + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 4, + ), +) + +llama3_8b_fp4 = ModelFusionInfo( + model_name="nvidia/Llama-3.1-8B-Instruct-FP4", + matches=lambda n_layers: Matches( + rms_quant_fusion=0, + act_quant_fusion=n_layers, + attn_quant_fusion=n_layers, + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 4, + ), +) + +# MoEs cannot do act+quant fusion because those ops are hidden from torch.compile. +# MoEs also only expose 1 rms+quant fusion because the quant for up_proj is hidden. +# TODO(luka): https://github.com/vllm-project/vllm/issues/31985 +# Also, for MoEs, gemm+collective fusion only happens for dense GEMMs (o_proj/qkv proj) + +llama4_scout_fp8 = ModelFusionInfo( + model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", + hf_overrides=lambda n_layers: {"text_config": {"num_hidden_layers": n_layers}}, + matches=lambda n_layers: Matches( + rms_quant_fusion=n_layers, + attn_quant_fusion=n_layers, + ar_rms_fusion=n_layers * 2, + sequence_parallel=n_layers * 2, + async_tp=n_layers * 2 - 1, + ), +) + +llama4_scout_fp4 = ModelFusionInfo( + model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-NVFP4", + hf_overrides=lambda n_layers: {"text_config": {"num_hidden_layers": n_layers}}, + matches=lambda n_layers: Matches( + rms_quant_fusion=0, + attn_quant_fusion=n_layers, + ar_rms_fusion=n_layers * 2, + sequence_parallel=n_layers * 2, + async_tp=n_layers * 2 - 1, + ), +) + +qwen3_a3b = ModelFusionInfo( + model_name="Qwen/Qwen3-30B-A3B", + matches=lambda n_layers: Matches( + norm_rope_fusion=n_layers, + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 2, + ), +) + +qwen3_a3b_fp8 = ModelFusionInfo( + model_name="Qwen/Qwen3-30B-A3B-FP8", + matches=lambda n_layers: Matches( + rms_quant_fusion=n_layers, + # TODO broken on Blackwell: + # https://github.com/vllm-project/vllm/issues/33295 + norm_rope_fusion=0 if is_blackwell() else n_layers, + attn_quant_fusion=0, # attn + group quant not supported + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 2, + ), +) diff --git a/tests/compile/fusions_e2e/test_tp1_quant.py b/tests/compile/fusions_e2e/test_tp1_quant.py new file mode 100644 index 000000000..03f102794 --- /dev/null +++ b/tests/compile/fusions_e2e/test_tp1_quant.py @@ -0,0 +1,146 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Callable + +import pytest + +from vllm.config import PassConfig + +from .common import ( + INDUCTOR_GRAPH_PARTITION, + AttentionBackendCase, + Matches, + custom_ops_combos, + is_blackwell, +) +from .models import ( + FLASHINFER_ATTN, + TRITON_ATTN, + llama3_8b_fp4, + llama3_8b_fp8, + llama4_scout_fp4, + llama4_scout_fp8, + qwen3_a3b_fp8, +) + + +@pytest.mark.parametrize( + "model_name, matches_fn, model_kwargs, hf_overrides, use_deepgemm", + [ + (*llama3_8b_fp8, False), + (*llama4_scout_fp8, False), + (*qwen3_a3b_fp8, False), + (*qwen3_a3b_fp8, True), + ], +) +@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN]) +@pytest.mark.parametrize("n_layers", [6]) +@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm")) +@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION) +def test_tp1_fp8_fusions( + model_name: str, + matches_fn: Callable[[int], Matches], + model_kwargs: dict, + hf_overrides: Callable[[int], dict], + attn_backend: AttentionBackendCase, + n_layers: int, + custom_ops: str, + inductor_graph_partition: bool, + use_deepgemm: bool, + run_e2e_fusion_test, + monkeypatch, +): + if use_deepgemm: + # TODO(luka/eliza) DeepGEMM uses different quants, matching not supported + # - on Blackwell, uses a special quant fp8, currently not supported + # - on Hopper, tma-aligned scales inhibit matching (fix WIP) + pytest.skip("DeepGEMM & quant matching not currently supported") + + matches = matches_fn(n_layers) + + if "qwen" in model_name.lower() and "-quant_fp8" in custom_ops: + # This is why config forces +quant_fp8 by default + pytest.skip("native QuantFP8 matching not supported for group quant") + + # Reduce size of model and skip weight loading time + model_kwargs["hf_overrides"] = hf_overrides(n_layers) + model_kwargs["load_format"] = "dummy" + model_kwargs["max_model_len"] = 1024 + + compilation_config = dict( + use_inductor_graph_partition=inductor_graph_partition, + custom_ops=custom_ops.split(","), + pass_config=PassConfig( + fuse_norm_quant=True, + fuse_act_quant=True, + fuse_attn_quant=True, + enable_qk_norm_rope_fusion=True, + ), + ) + + matches_check = [ + "rms_quant_fusion", + "act_quant_fusion", + "norm_rope_fusion", + "attn_quant_fusion", + ] + + run_e2e_fusion_test( + model_name, + matches, + model_kwargs, + attn_backend, + compilation_config, + matches_check, + use_deepgemm=use_deepgemm, + ) + + +@pytest.mark.parametrize( + "model_name, matches_fn, model_kwargs, hf_overrides", + [llama3_8b_fp4, llama4_scout_fp4], +) +@pytest.mark.parametrize("attn_backend", [FLASHINFER_ATTN]) +@pytest.mark.parametrize("n_layers", [6]) +@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm")) +@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION) +@pytest.mark.skipif(not is_blackwell(), reason="Blackwell required for fp4") +def test_tp1_fp4_fusions( + model_name: str, + matches_fn: Callable[[int], Matches], + model_kwargs: dict, + hf_overrides: Callable[[int], dict], + attn_backend: AttentionBackendCase, + n_layers: int, + custom_ops: str, + inductor_graph_partition: bool, + run_e2e_fusion_test, +): + matches = matches_fn(n_layers) + + # Reduce size of model and skip weight loading time + model_kwargs["hf_overrides"] = hf_overrides(n_layers) + model_kwargs["load_format"] = "dummy" + model_kwargs["max_model_len"] = 1024 + + compilation_config = dict( + use_inductor_graph_partition=inductor_graph_partition, + custom_ops=custom_ops.split(","), + pass_config=PassConfig( + fuse_norm_quant=True, + fuse_act_quant=True, + fuse_attn_quant=True, + enable_qk_norm_rope_fusion=True, + ), + ) + + matches_check = ["act_quant_fusion", "attn_quant_fusion", "norm_rope_fusion"] + + run_e2e_fusion_test( + model_name, + matches, + model_kwargs, + attn_backend, + compilation_config, + matches_check, + ) diff --git a/tests/compile/fusions_e2e/test_tp2_ar_rms.py b/tests/compile/fusions_e2e/test_tp2_ar_rms.py new file mode 100644 index 000000000..18b19565c --- /dev/null +++ b/tests/compile/fusions_e2e/test_tp2_ar_rms.py @@ -0,0 +1,199 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Callable + +import pytest + +from vllm.config import PassConfig + +from ...utils import multi_gpu_test +from .common import ( + INDUCTOR_GRAPH_PARTITION, + AttentionBackendCase, + Matches, + custom_ops_combos, + is_blackwell, +) +from .models import ( + FLASHINFER_ATTN, + TRITON_ATTN, + llama3_8b, + llama3_8b_fp4, + llama3_8b_fp8, + llama4_scout_fp4, + llama4_scout_fp8, + qwen3_a3b, + qwen3_a3b_fp8, +) + + +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize( + "model_name, matches_fn, model_kwargs, hf_overrides", + # qwen3-fp8 should still fuse AR+rms even though group quant is not yet supported + [llama3_8b_fp8, llama4_scout_fp8, qwen3_a3b_fp8], +) +@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN]) +@pytest.mark.parametrize("n_layers", [4]) +@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm")) +@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION) +def test_tp2_ar_rms_fp8_fusions( + model_name: str, + matches_fn: Callable[[int], Matches], + model_kwargs: dict, + hf_overrides: Callable[[int], dict], + attn_backend: AttentionBackendCase, + n_layers: int, + custom_ops: str, + inductor_graph_partition: bool, + run_e2e_fusion_test, + monkeypatch, +): + matches = matches_fn(n_layers) + + if "qwen" in model_name.lower() and "-quant_fp8" in custom_ops: + # This is why config forces +quant_fp8 by default + pytest.skip("native QuantFP8 matching not supported for group quant") + + # Reduce size of model and skip weight loading time + model_kwargs["hf_overrides"] = hf_overrides(n_layers) + model_kwargs["load_format"] = "dummy" + model_kwargs["max_model_len"] = 1024 + + compilation_config = dict( + use_inductor_graph_partition=inductor_graph_partition, + custom_ops=custom_ops.split(","), + pass_config=PassConfig( + fuse_norm_quant=True, + fuse_act_quant=True, + fuse_attn_quant=True, + enable_qk_norm_rope_fusion=True, + fuse_allreduce_rms=True, + ), + ) + + matches_check = [ + "rms_quant_fusion", + "act_quant_fusion", + "norm_rope_fusion", + "attn_quant_fusion", + "ar_rms_fusion", + ] + + run_e2e_fusion_test( + model_name, + matches, + model_kwargs, + attn_backend, + compilation_config, + matches_check, + tp_size=2, + ) + + +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize( + "model_name, matches_fn, model_kwargs, hf_overrides", + [llama3_8b_fp4, llama4_scout_fp4], +) +@pytest.mark.parametrize("attn_backend", [FLASHINFER_ATTN]) +@pytest.mark.parametrize("n_layers", [4]) +@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm")) +@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION) +@pytest.mark.skipif(not is_blackwell(), reason="Blackwell required for fp4") +def test_tp2_ar_rms_fp4_fusions( + model_name: str, + matches_fn: Callable[[int], Matches], + model_kwargs: dict, + hf_overrides: Callable[[int], dict], + attn_backend: AttentionBackendCase, + n_layers: int, + custom_ops: str, + inductor_graph_partition: bool, + run_e2e_fusion_test, + monkeypatch, +): + matches = matches_fn(n_layers) + + # Reduce size of model and skip weight loading time + model_kwargs["hf_overrides"] = hf_overrides(n_layers) + model_kwargs["load_format"] = "dummy" + model_kwargs["max_model_len"] = 1024 + + compilation_config = dict( + use_inductor_graph_partition=inductor_graph_partition, + custom_ops=custom_ops.split(","), + pass_config=PassConfig( + fuse_act_quant=True, + fuse_attn_quant=True, + fuse_allreduce_rms=True, + ), + ) + + matches_check = [ + "act_quant_fusion", + "attn_quant_fusion", + "ar_rms_fusion", + ] + + run_e2e_fusion_test( + model_name, + matches, + model_kwargs, + attn_backend, + compilation_config, + matches_check, + tp_size=2, + ) + + +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize( + "model_name, matches_fn, model_kwargs, hf_overrides", + [llama3_8b, qwen3_a3b], +) +@pytest.mark.parametrize("attn_backend", [TRITON_ATTN]) +@pytest.mark.parametrize("n_layers", [4]) +@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm")) +@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION) +def test_tp2_ar_rms_fusions( + model_name: str, + matches_fn: Callable[[int], Matches], + model_kwargs: dict, + hf_overrides: Callable[[int], dict], + attn_backend: AttentionBackendCase, + n_layers: int, + custom_ops: str, + inductor_graph_partition: bool, + run_e2e_fusion_test, +): + matches = matches_fn(n_layers) + + # Reduce size of model and skip weight loading time + model_kwargs["hf_overrides"] = hf_overrides(n_layers) + model_kwargs["load_format"] = "dummy" + model_kwargs["max_model_len"] = 1024 + + compilation_config = dict( + use_inductor_graph_partition=inductor_graph_partition, + custom_ops=custom_ops.split(","), + pass_config=PassConfig( + enable_qk_norm_rope_fusion=True, + fuse_allreduce_rms=True, + ), + ) + + matches_check = [ + "norm_rope_fusion", + "ar_rms_fusion", + ] + + run_e2e_fusion_test( + model_name, + matches, + model_kwargs, + attn_backend, + compilation_config, + matches_check, + tp_size=2, + ) diff --git a/tests/compile/fusions_e2e/test_tp2_async_tp.py b/tests/compile/fusions_e2e/test_tp2_async_tp.py new file mode 100644 index 000000000..4769ca1e0 --- /dev/null +++ b/tests/compile/fusions_e2e/test_tp2_async_tp.py @@ -0,0 +1,143 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Callable + +import pytest + +from vllm.config import PassConfig + +from ...utils import multi_gpu_test +from .common import ( + INDUCTOR_GRAPH_PARTITION, + AttentionBackendCase, + Matches, + custom_ops_combos, + is_blackwell, +) +from .models import ( + FLASHINFER_ATTN, + TRITON_ATTN, + llama3_8b, + llama3_8b_fp8, + llama4_scout_fp8, + qwen3_a3b, +) + + +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize( + "model_name, matches_fn, model_kwargs, hf_overrides", + [llama3_8b_fp8, llama4_scout_fp8], +) +@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN]) +@pytest.mark.parametrize("n_layers", [4]) +@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm")) +@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION) +def test_tp2_async_tp_fp8_fusions( + model_name: str, + matches_fn: Callable[[int], Matches], + model_kwargs: dict, + hf_overrides: Callable[[int], dict], + attn_backend: AttentionBackendCase, + n_layers: int, + custom_ops: str, + inductor_graph_partition: bool, + run_e2e_fusion_test, + monkeypatch, +): + matches = matches_fn(n_layers) + + if is_blackwell(): + # Disable FlashInfer scaled_mm FP8 as it's not supported in async tp patterns + monkeypatch.setenv("VLLM_DISABLED_KERNELS", "FlashInferFP8ScaledMMLinearKernel") + + # Reduce size of model and skip weight loading time + model_kwargs["hf_overrides"] = hf_overrides(n_layers) + model_kwargs["load_format"] = "dummy" + model_kwargs["max_model_len"] = 1024 + + compilation_config = dict( + use_inductor_graph_partition=inductor_graph_partition, + custom_ops=custom_ops.split(","), + pass_config=PassConfig( + fuse_norm_quant=True, + fuse_act_quant=True, + fuse_attn_quant=True, + enable_qk_norm_rope_fusion=True, + enable_sp=True, + fuse_gemm_comms=True, + ), + ) + + matches_check = [ + "rms_quant_fusion", + "act_quant_fusion", + "norm_rope_fusion", + "attn_quant_fusion", + "sequence_parallel", + "async_tp", + ] + + run_e2e_fusion_test( + model_name, + matches, + model_kwargs, + attn_backend, + compilation_config, + matches_check, + tp_size=2, + ) + + +@multi_gpu_test(num_gpus=2) +@pytest.mark.parametrize( + "model_name, matches_fn, model_kwargs, hf_overrides", + [llama3_8b, qwen3_a3b], +) +@pytest.mark.parametrize("attn_backend", [TRITON_ATTN]) +@pytest.mark.parametrize("n_layers", [4]) +@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm")) +@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION) +def test_tp2_async_tp_fusions( + model_name: str, + matches_fn: Callable[[int], Matches], + model_kwargs: dict, + hf_overrides: Callable[[int], dict], + attn_backend: AttentionBackendCase, + n_layers: int, + custom_ops: str, + inductor_graph_partition: bool, + run_e2e_fusion_test, +): + matches = matches_fn(n_layers) + + # Reduce size of model and skip weight loading time + model_kwargs["hf_overrides"] = hf_overrides(n_layers) + model_kwargs["load_format"] = "dummy" + model_kwargs["max_model_len"] = 1024 + + compilation_config = dict( + use_inductor_graph_partition=inductor_graph_partition, + custom_ops=custom_ops.split(","), + pass_config=PassConfig( + enable_qk_norm_rope_fusion=True, + enable_sp=True, + fuse_gemm_comms=True, + ), + ) + + matches_check = [ + "norm_rope_fusion", + "sequence_parallel", + "async_tp", + ] + + run_e2e_fusion_test( + model_name, + matches, + model_kwargs, + attn_backend, + compilation_config, + matches_check, + tp_size=2, + ) diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index 50492a569..6515c5222 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -1,23 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy -import logging -from typing import Any import pytest -import regex as re import torch._dynamo from tests.compile.backend import LazyInitPass, TestBackend -from tests.compile.fusion_test_utils import ( - CUSTOM_OPS_FP8, - MODELS_FP4, - MODELS_FP8, - Matches, - has_cuda_graph_wrapper_metadata, - is_blackwell, - run_model, -) from tests.utils import flat_product from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant @@ -31,7 +19,6 @@ from vllm.config import ( CacheConfig, CompilationConfig, CompilationMode, - CUDAGraphMode, ModelConfig, PassConfig, SchedulerConfig, @@ -47,7 +34,6 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( ) from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer -from vllm.utils.torch_utils import is_torch_equal_or_newer from vllm.v1.attention.backend import AttentionMetadata from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.v1.kv_cache_interface import AttentionSpec @@ -501,88 +487,3 @@ def test_attention_quant_pattern( # Check that results are close torch.testing.assert_close(result_unfused, result_fused, atol=1e-2, rtol=1e-2) - - -@pytest.mark.parametrize( - "model_name, model_kwargs, backend, matches, custom_ops", - # Test attention+quant_fp8 fusion with custom and torch impls of QuantFP8 - list(flat_product(MODELS_FP8, CUSTOM_OPS_FP8)) - # quant_fp4 only has the custom impl - + list(flat_product(MODELS_FP4, [""])), -) -@pytest.mark.parametrize( - "inductor_graph_partition", - [ - pytest.param( - True, - marks=pytest.mark.skipif( - not has_cuda_graph_wrapper_metadata(), - reason="This test requires" - "torch._inductor.utils.CUDAGraphWrapperMetadata to run", - ), - ), - False, - ], -) -def test_attn_quant( - model_name: str, - model_kwargs: dict[str, Any], - backend: AttentionBackendEnum, - matches: Matches, - custom_ops: str, - inductor_graph_partition: bool, - caplog_mp_spawn, - monkeypatch, -): - if not current_platform.has_device_capability(90): - pytest.skip("test_attn_quant requires H100 (SM90) or B200 (SM100) GPU") - if backend == AttentionBackendEnum.FLASHINFER and ( - not is_blackwell() or not has_flashinfer() - ): - pytest.skip("FlashInfer attn fusion requires Blackwell and flashinfer") - if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"): - pytest.skip("Inductor graph partition requires torch>=2.9") - - custom_ops_list = custom_ops.split(",") if custom_ops else [] - - if inductor_graph_partition: - mode = CUDAGraphMode.FULL_AND_PIECEWISE - splitting_ops: list[str] | None = None - else: - # FIXME: Llama-4-Scout-17B-16E-Instruct-FP8 + FlashInfer + Blackwell end at - # CUDAGraphMode.NONE here because it derives an attention backend that - # does not support full cudagraphs - mode = CUDAGraphMode.FULL_DECODE_ONLY - splitting_ops = [] - - # Disable, compile cache to make sure custom passes run. - # Otherwise, we can't verify fusion happened through the logs. - monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1") - - # To capture subprocess logs, we need to know whether spawn or fork is used. - # Force spawn as it is more general. - monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn") - model_kwargs["attention_config"] = {"backend": backend.name} - - compilation_config = CompilationConfig( - # Testing properties - custom_ops=custom_ops_list, - use_inductor_graph_partition=inductor_graph_partition, - cudagraph_mode=mode, - splitting_ops=splitting_ops, - # Common - mode=CompilationMode.VLLM_COMPILE, - pass_config=PassConfig(fuse_attn_quant=True, eliminate_noops=True), - # Inductor caches custom passes by default as well via uuid - inductor_compile_config={"force_disable_caches": True}, - ) - - with caplog_mp_spawn(logging.DEBUG) as log_holder: - run_model(compilation_config, model_name, **model_kwargs) - - log_matches = re.findall( - r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes", - log_holder.text, - ) - assert len(log_matches) == 1, log_holder.text - assert int(log_matches[0]) == matches.attention_fusion diff --git a/tests/test_config.py b/tests/test_config.py index f3c3003a0..6e2a59661 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1002,7 +1002,7 @@ def test_vllm_config_explicit_overrides(): assert config.compilation_config.pass_config.fuse_attn_quant is True # Explicit cudagraph mode override on quantized model at O2 - pass_config = PassConfig(fuse_gemm_comms=True) + pass_config = PassConfig(enable_qk_norm_rope_fusion=True) compilation_config = CompilationConfig( cudagraph_mode=CUDAGraphMode.NONE, pass_config=pass_config ) @@ -1012,7 +1012,7 @@ def test_vllm_config_explicit_overrides(): compilation_config=compilation_config, ) assert config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE - assert config.compilation_config.pass_config.fuse_gemm_comms is True + assert config.compilation_config.pass_config.enable_qk_norm_rope_fusion is True # Mode should still use default for O2 assert config.compilation_config.mode == CompilationMode.VLLM_COMPILE diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 846ed50e0..93d88730e 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -766,7 +766,12 @@ class VllmConfig: if self.compilation_config.pass_config.fuse_gemm_comms: self.compilation_config.pass_config.enable_sp = True if self.compilation_config.pass_config.enable_sp: - if "-rms_norm" in self.compilation_config.custom_ops: + if self.parallel_config.tensor_parallel_size == 1: + logger.warning("Sequence Parallelism requires TP>1, disabling") + self.compilation_config.pass_config.enable_sp = False + self.compilation_config.pass_config.fuse_gemm_comms = False + + elif "-rms_norm" in self.compilation_config.custom_ops: logger.warning( "RMS norm force disabled, sequence parallelism might break" )