# CI group: compilation correctness & fusion tests.
# Runs only after the container image is built; individual steps are gated on
# the source paths they exercise via source_file_dependencies.
group: Compile
depends_on:
  - image-build
steps:
  - label: Sequence Parallel Correctness Tests (2 GPUs)
    timeout_in_minutes: 50
    working_dir: "/vllm-workspace/"
    num_devices: 2
    source_file_dependencies:
      - vllm/model_executor/layers/
      - vllm/compilation/
      - vllm/v1/worker/
      - vllm/v1/cudagraph_dispatcher.py
      - tests/compile/correctness_e2e/test_sequence_parallel.py
    commands:
      - export VLLM_TEST_CLEAN_GPU_MEMORY=1
      - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py

  - label: Sequence Parallel Correctness Tests (2xH100)
    timeout_in_minutes: 50
    working_dir: "/vllm-workspace/"
    device: h100
    optional: true
    num_devices: 2
    commands:
      - export VLLM_TEST_CLEAN_GPU_MEMORY=1
      - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py

  - label: AsyncTP Correctness Tests (2xH100)
    timeout_in_minutes: 50
    working_dir: "/vllm-workspace/"
    device: h100
    optional: true
    num_devices: 2
    commands:
      - export VLLM_TEST_CLEAN_GPU_MEMORY=1
      - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py

  - label: Distributed Compile Unit Tests (2xH100)
    timeout_in_minutes: 20
    working_dir: "/vllm-workspace/"
    device: h100
    num_devices: 2
    source_file_dependencies:
      - vllm/compilation/
      - vllm/model_executor/layers
      - tests/compile/passes/distributed/
    commands:
      - export VLLM_TEST_CLEAN_GPU_MEMORY=1
      - pytest -s -v tests/compile/passes/distributed

  - label: Fusion and Compile Unit Tests (B200)
    timeout_in_minutes: 20
    working_dir: "/vllm-workspace/"
    device: b200
    source_file_dependencies:
      - csrc/quantization/fp4/
      - vllm/model_executor/layers/quantization/
      - vllm/model_executor/layers/layernorm.py
      - vllm/model_executor/layers/activation.py
      - vllm/model_executor/layers/attention/attention.py
      - vllm/v1/attention/backends/flashinfer.py
      - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
      - tests/compile/passes/test_fusion_attn.py
      - tests/compile/passes/test_silu_mul_quant_fusion.py
      - tests/compile/passes/distributed/test_fusion_all_reduce.py
      - tests/compile/fullgraph/test_full_graph.py
    commands:
      # b200 runners are limited, so we limit the tests to the minimum set only supported on Blackwell
      - nvidia-smi
      - pytest -v -s tests/compile/passes/test_fusion_attn.py -k FLASHINFER
      - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
      # this runner has 2 GPUs available even though num_devices=2 is not set
      - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
      # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
      # TODO(luka) move to H100 once pass tests run on H100
      - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

  - label: Fusion E2E Quick (H100)
    timeout_in_minutes: 15
    working_dir: "/vllm-workspace/"
    device: h100
    num_devices: 1
    source_file_dependencies:
      - csrc/quantization/
      - vllm/model_executor/
      - vllm/v1/attention/
      - vllm/compilation/
      - tests/compile/fusions_e2e/
    commands:
      - nvidia-smi
      # Run all models and attn backends but only Inductor partition and native custom ops
      - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
      # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
      - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"

  - label: Fusion E2E Config Sweep (H100)
    timeout_in_minutes: 30
    working_dir: "/vllm-workspace/"
    device: h100
    num_devices: 1
    source_file_dependencies:
      - csrc/quantization/
      - vllm/compilation/ # can affect pattern matching
      - vllm/model_executor/layers/layernorm.py
      - vllm/model_executor/layers/activation.py
      - vllm/model_executor/layers/attention/attention.py
      - vllm/model_executor/layers/quantization/input_quant_fp8.py
      - tests/compile/fusions_e2e/
    commands:
      - nvidia-smi
      # Run just llama3 (fp8) for all config combinations
      - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"

  - label: Fusion E2E Config Sweep (B200)
    timeout_in_minutes: 30
    working_dir: "/vllm-workspace/"
    device: b200
    num_devices: 1
    optional: true
    commands:
      - nvidia-smi
      # Run all models but only FLASHINFER, Inductor partition and native custom ops
      # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
      # Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
      - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3) or llama-3)"

  - label: Fusion E2E TP2 Quick (H100)
    timeout_in_minutes: 20
    working_dir: "/vllm-workspace/"
    device: h100
    num_devices: 2
    source_file_dependencies:
      - csrc/quantization/
      - vllm/model_executor/
      - vllm/v1/attention/
      - vllm/compilation/
      - tests/compile/fusions_e2e/
    commands:
      - nvidia-smi
      # Run all models and attn backends but only Inductor partition and native custom ops
      - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
      - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"

  - label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
    timeout_in_minutes: 40
    working_dir: "/vllm-workspace/"
    device: h100
    num_devices: 2
    source_file_dependencies:
      - csrc/quantization/
      - vllm/compilation/ # can affect pattern matching
      - vllm/model_executor/layers/layernorm.py
      - vllm/model_executor/layers/activation.py
      - vllm/model_executor/layers/attention/attention.py
      - vllm/model_executor/layers/quantization/input_quant_fp8.py
      - tests/compile/fusions_e2e/
    commands:
      - nvidia-smi
      # Run just llama3 (fp8 & bf16) for all config combinations
      - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"

  - label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
    timeout_in_minutes: 40
    working_dir: "/vllm-workspace/"
    device: h100
    num_devices: 2
    source_file_dependencies:
      - csrc/quantization/
      - vllm/compilation/ # can affect pattern matching
      - vllm/model_executor/layers/layernorm.py
      - vllm/model_executor/layers/activation.py
      - vllm/model_executor/layers/attention/attention.py
      - vllm/model_executor/layers/quantization/input_quant_fp8.py
      - tests/compile/fusions_e2e/
    commands:
      - nvidia-smi
      # Run just llama3 (fp8 & bf16) for all config combinations
      - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"

  - label: Fusion E2E TP2 (B200)
    timeout_in_minutes: 20
    working_dir: "/vllm-workspace/"
    device: b200
    num_devices: 2
    source_file_dependencies:
      - csrc/quantization/
      - vllm/model_executor/
      - vllm/v1/attention/
      - vllm/compilation/
      - tests/compile/fusions_e2e/
    commands:
      - nvidia-smi
      # Run all models but only FLASHINFER, Inductor partition and native custom ops
      # include qwen with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
      # for ar-rms-quant-fp4, also sweep llama3
      - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)) or Llama-3.1-8B-Instruct-FP4"
      - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)"