# Signed-off-by: Carl You <4531192+carlyou@users.noreply.github.com>
# Signed-off-by: Carl Y <4531192+carlyou@users.noreply.github.com>
# Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
# CI pipeline group for torch.compile-related tests.
# Each step declares its device requirements, source_file_dependencies
# (paths that trigger the step when changed), and the commands to run.
group: Compile
depends_on:
  - image-build
steps:
  - label: Sequence Parallel Correctness Tests (2 GPUs)
    timeout_in_minutes: 50
    working_dir: "/vllm-workspace/"
    num_devices: 2
    source_file_dependencies:
      - vllm/model_executor/layers/
      - vllm/compilation/
      - vllm/v1/worker/
      - vllm/v1/cudagraph_dispatcher.py
      - tests/compile/correctness_e2e/test_sequence_parallel.py
    commands:
      - export VLLM_TEST_CLEAN_GPU_MEMORY=1
      - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py

  - label: Sequence Parallel Correctness Tests (2xH100)
    timeout_in_minutes: 50
    working_dir: "/vllm-workspace/"
    device: h100
    optional: true
    num_devices: 2
    commands:
      - export VLLM_TEST_CLEAN_GPU_MEMORY=1
      - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py

  - label: AsyncTP Correctness Tests (2xH100)
    timeout_in_minutes: 50
    working_dir: "/vllm-workspace/"
    device: h100
    optional: true
    num_devices: 2
    commands:
      - export VLLM_TEST_CLEAN_GPU_MEMORY=1
      - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py

  - label: AsyncTP Correctness Tests (B200)
    timeout_in_minutes: 50
    working_dir: "/vllm-workspace/"
    device: b200
    optional: true
    num_devices: 2
    commands:
      - export VLLM_TEST_CLEAN_GPU_MEMORY=1
      - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py

  - label: Distributed Compile Unit Tests (2xH100)
    timeout_in_minutes: 20
    working_dir: "/vllm-workspace/"
    device: h100
    num_devices: 2
    source_file_dependencies:
      - vllm/compilation/
      - vllm/model_executor/layers
      - tests/compile/passes/distributed/
    commands:
      - export VLLM_TEST_CLEAN_GPU_MEMORY=1
      - pytest -s -v tests/compile/passes/distributed

  - label: Fusion and Compile Unit Tests (2xB200)
    timeout_in_minutes: 20
    working_dir: "/vllm-workspace/"
    device: b200
    source_file_dependencies:
      - csrc/quantization/fp4/
      - vllm/model_executor/layers/quantization/
      - vllm/model_executor/layers/layernorm.py
      - vllm/model_executor/layers/activation.py
      - vllm/model_executor/layers/attention/attention.py
      - vllm/v1/attention/backends/flashinfer.py
      - vllm/compilation/  # TODO(luka) limit to vllm/compilation/passes
      - tests/compile/passes/test_fusion_attn.py
      - tests/compile/passes/test_mla_attn_quant_fusion.py
      - tests/compile/passes/test_silu_mul_quant_fusion.py
      - tests/compile/passes/distributed/test_fusion_all_reduce.py
      - tests/compile/fullgraph/test_full_graph.py
    commands:
      # b200 runners are limited, so we limit the tests to the minimum set only supported on Blackwell
      - nvidia-smi
      - pytest -v -s tests/compile/passes/test_fusion_attn.py -k FLASHINFER
      - pytest -v -s tests/compile/passes/test_mla_attn_quant_fusion.py
      - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
      # this runner has 2 GPUs available even though num_devices=2 is not set
      - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
      # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
      # TODO(luka) move to H100 once pass tests run on H100
      - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

  - label: Fusion E2E Quick (H100)
    timeout_in_minutes: 15
    working_dir: "/vllm-workspace/"
    device: h100
    num_devices: 1
    source_file_dependencies:
      - csrc/quantization/
      - vllm/model_executor/
      - vllm/v1/attention/
      - vllm/compilation/
      - tests/compile/fusions_e2e/
    commands:
      - nvidia-smi
      # Run all models and attn backends but only Inductor partition and native custom ops
      - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
      # Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
      - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and (qwen3 or deepseek)"

  - label: Fusion E2E Config Sweep (H100)
    timeout_in_minutes: 30
    working_dir: "/vllm-workspace/"
    device: h100
    num_devices: 1
    source_file_dependencies:
      - csrc/quantization/
      - vllm/compilation/
      # can affect pattern matching
      - vllm/model_executor/layers/layernorm.py
      - vllm/model_executor/layers/activation.py
      - vllm/model_executor/layers/attention/attention.py
      - vllm/model_executor/layers/quantization/input_quant_fp8.py
      - tests/compile/fusions_e2e/
    commands:
      - nvidia-smi
      # Run just llama3 (fp8) for all config combinations
      - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"

  - label: Fusion E2E Config Sweep (B200)
    timeout_in_minutes: 30
    working_dir: "/vllm-workspace/"
    device: b200
    num_devices: 1
    optional: true
    commands:
      - nvidia-smi
      # Run all models but only FLASHINFER, Inductor partition and native custom ops
      # Qwen/Deepseek requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
      # Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
      - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek)) or llama-3)"

  - label: Fusion E2E TP2 Quick (H100)
    timeout_in_minutes: 20
    working_dir: "/vllm-workspace/"
    device: h100
    num_devices: 2
    source_file_dependencies:
      - csrc/quantization/
      - vllm/model_executor/
      - vllm/v1/attention/
      - vllm/compilation/
      - tests/compile/fusions_e2e/
    commands:
      - nvidia-smi
      # Run all models and attn backends but only Inductor partition and native custom ops
      - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"
      - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"

  - label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
    timeout_in_minutes: 40
    working_dir: "/vllm-workspace/"
    device: h100
    num_devices: 2
    source_file_dependencies:
      - csrc/quantization/
      - vllm/compilation/
      # can affect pattern matching
      - vllm/model_executor/layers/layernorm.py
      - vllm/model_executor/layers/activation.py
      - vllm/model_executor/layers/attention/attention.py
      - vllm/model_executor/layers/quantization/input_quant_fp8.py
      - tests/compile/fusions_e2e/
    commands:
      - nvidia-smi
      # Run just llama3 (fp8 & bf16) for all config combinations
      - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"

  - label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
    timeout_in_minutes: 40
    working_dir: "/vllm-workspace/"
    device: h100
    num_devices: 2
    source_file_dependencies:
      - csrc/quantization/
      - vllm/compilation/
      # can affect pattern matching
      - vllm/model_executor/layers/layernorm.py
      - vllm/model_executor/layers/activation.py
      - vllm/model_executor/layers/attention/attention.py
      - vllm/model_executor/layers/quantization/input_quant_fp8.py
      - tests/compile/fusions_e2e/
    commands:
      - nvidia-smi
      # Run just llama3 (fp8 & bf16) for all config combinations
      - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"

  - label: Fusion E2E TP2 (B200)
    timeout_in_minutes: 20
    working_dir: "/vllm-workspace/"
    device: b200
    num_devices: 2
    source_file_dependencies:
      - csrc/quantization/
      - vllm/model_executor/
      - vllm/v1/attention/
      - vllm/compilation/
      - tests/compile/fusions_e2e/
    commands:
      - nvidia-smi
      # Run all models but only FLASHINFER, Inductor partition and native custom ops
      # include qwen/deepseek with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
      # for ar-rms-quant-fp4, also sweep llama3
      - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))) or Llama-3.1-8B-Instruct-FP4"
      - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and (qwen3 or deepseek))"