[CI] Split V1 e2e + engine (1 GPU) into separate jobs (#36945)

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Kevin H. Luu
2026-03-13 14:16:02 -07:00
committed by GitHub
parent 0005d2a3c9
commit f1816fb192
18 changed files with 81 additions and 39 deletions

View File

@@ -369,7 +369,7 @@ steps:
- vllm/
- tests/v1
commands:
- pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
- label: V1 Test e2e (4 GPUs) # 65min
timeout_in_minutes: 90
@@ -380,7 +380,7 @@ steps:
- vllm/
- tests/v1
commands:
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
- label: V1 Test entrypoints # 35min
timeout_in_minutes: 50
@@ -1744,7 +1744,7 @@ steps:
- tests/v1
commands:
# Only run tests that need exactly 2 GPUs
- pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
- label: V1 Test e2e (4 GPUs) # 65min
timeout_in_minutes: 90
@@ -1759,7 +1759,7 @@ steps:
- tests/v1
commands:
# Only run tests that need 4 GPUs
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
- label: V1 Test entrypoints # 35min
timeout_in_minutes: 50
@@ -3494,7 +3494,7 @@ steps:
- tests/v1
commands:
# Only run tests that need exactly 2 GPUs
- pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
- label: V1 Test e2e (4 GPUs) # 65min
timeout_in_minutes: 90
@@ -3509,7 +3509,7 @@ steps:
- tests/v1
commands:
# Only run tests that need 4 GPUs
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
- label: V1 Test entrypoints # 35min
timeout_in_minutes: 50

View File

@@ -1,5 +1,5 @@
group: Engine
depends_on:
depends_on:
- image-build
steps:
- label: Engine
@@ -14,28 +14,30 @@ steps:
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
- label: V1 e2e + engine (1 GPU)
timeout_in_minutes: 45
- label: Engine (1 GPU)
timeout_in_minutes: 30
source_file_dependencies:
- vllm/
- tests/v1
- vllm/v1/engine/
- tests/v1/engine/
commands:
# TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- pytest -v -s v1/e2e
# Run this test standalone for now;
# need to untangle use (implicit) use of spawn/fork across the tests.
- pytest -v -s v1/engine/test_preprocess_error_handling.py
# Run the rest of v1/engine tests
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
mirror:
amd:
device: mi325_1
depends_on:
- image-build-amd
commands:
- pytest -v -s v1/e2e
- pytest -v -s v1/engine
- label: e2e Scheduling (1 GPU)
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/
- tests/v1/e2e/general/
commands:
- pytest -v -s v1/e2e/general/test_async_scheduling.py
- label: e2e Core (1 GPU)
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/
- tests/v1/e2e/general/
commands:
- pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py
- label: V1 e2e (2 GPUs)
timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
@@ -46,7 +48,7 @@ steps:
- tests/v1/e2e
commands:
# Only run tests that need exactly 2 GPUs
- pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
mirror:
amd:
device: mi325_2
@@ -62,7 +64,7 @@ steps:
- tests/v1/e2e
commands:
# Only run tests that need 4 GPUs
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
mirror:
amd:
device: mi325_4

View File

@@ -18,9 +18,9 @@ steps:
- pytest -v -s v1/engine/test_llm_engine.py -k "not test_engine_metrics"
# This requires eager until we sort out CG correctness issues.
# TODO: remove ENFORCE_EAGER here after https://github.com/vllm-project/vllm/pull/32936 is merged.
- ENFORCE_EAGER=1 pytest -v -s v1/e2e/test_async_scheduling.py -k "not ngram"
- pytest -v -s v1/e2e/test_context_length.py
- pytest -v -s v1/e2e/test_min_tokens.py
- ENFORCE_EAGER=1 pytest -v -s v1/e2e/general/test_async_scheduling.py -k "not ngram"
- pytest -v -s v1/e2e/general/test_context_length.py
- pytest -v -s v1/e2e/general/test_min_tokens.py
# Temporary hack filter to exclude ngram spec decoding based tests.
- pytest -v -s v1/entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"
@@ -102,9 +102,9 @@ steps:
- vllm/v1/worker/gpu/
- vllm/v1/worker/gpu_worker.py
- tests/v1/spec_decode/test_max_len.py
- tests/v1/e2e/test_spec_decode.py
- tests/v1/e2e/spec_decode/test_spec_decode.py
commands:
- set -x
- export VLLM_USE_V2_MODEL_RUNNER=1
- pytest -v -s v1/spec_decode/test_max_len.py -k "eagle or mtp"
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle or mtp"
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle or mtp"

View File

@@ -0,0 +1,40 @@
group: Spec Decode
depends_on:
- image-build
steps:
- label: Spec Decode Eagle
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "eagle_correctness"
- label: Spec Decode Speculators + MTP
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- vllm/transformers_utils/configs/speculators/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness"
- label: Spec Decode Ngram + Suffix
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "ngram or suffix"
- label: Spec Decode Draft Model
timeout_in_minutes: 30
source_file_dependencies:
- vllm/v1/spec_decode/
- vllm/v1/worker/gpu/spec_decode/
- tests/v1/e2e/spec_decode/
commands:
- pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference"