[CI] Split V1 e2e + engine (1 GPU) into separate jobs (#36945)
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -369,7 +369,7 @@ steps:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
commands:
|
||||
- pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
|
||||
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
|
||||
|
||||
- label: V1 Test e2e (4 GPUs) # 65min
|
||||
timeout_in_minutes: 90
|
||||
@@ -380,7 +380,7 @@ steps:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
commands:
|
||||
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
|
||||
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
|
||||
|
||||
- label: V1 Test entrypoints # 35min
|
||||
timeout_in_minutes: 50
|
||||
@@ -1744,7 +1744,7 @@ steps:
|
||||
- tests/v1
|
||||
commands:
|
||||
# Only run tests that need exactly 2 GPUs
|
||||
- pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
|
||||
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
|
||||
|
||||
- label: V1 Test e2e (4 GPUs) # 65min
|
||||
timeout_in_minutes: 90
|
||||
@@ -1759,7 +1759,7 @@ steps:
|
||||
- tests/v1
|
||||
commands:
|
||||
# Only run tests that need 4 GPUs
|
||||
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
|
||||
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
|
||||
|
||||
- label: V1 Test entrypoints # 35min
|
||||
timeout_in_minutes: 50
|
||||
@@ -3494,7 +3494,7 @@ steps:
|
||||
- tests/v1
|
||||
commands:
|
||||
# Only run tests that need exactly 2 GPUs
|
||||
- pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
|
||||
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
|
||||
|
||||
- label: V1 Test e2e (4 GPUs) # 65min
|
||||
timeout_in_minutes: 90
|
||||
@@ -3509,7 +3509,7 @@ steps:
|
||||
- tests/v1
|
||||
commands:
|
||||
# Only run tests that need 4 GPUs
|
||||
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
|
||||
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
|
||||
|
||||
- label: V1 Test entrypoints # 35min
|
||||
timeout_in_minutes: 50
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
group: Engine
|
||||
depends_on:
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Engine
|
||||
@@ -14,28 +14,30 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
|
||||
|
||||
- label: V1 e2e + engine (1 GPU)
|
||||
timeout_in_minutes: 45
|
||||
- label: Engine (1 GPU)
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/v1
|
||||
- vllm/v1/engine/
|
||||
- tests/v1/engine/
|
||||
commands:
|
||||
# TODO: accuracy does not match, whether setting
|
||||
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
|
||||
- pytest -v -s v1/e2e
|
||||
# Run this test standalone for now;
|
||||
# need to untangle use (implicit) use of spawn/fork across the tests.
|
||||
- pytest -v -s v1/engine/test_preprocess_error_handling.py
|
||||
# Run the rest of v1/engine tests
|
||||
- pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_1
|
||||
depends_on:
|
||||
- image-build-amd
|
||||
commands:
|
||||
- pytest -v -s v1/e2e
|
||||
- pytest -v -s v1/engine
|
||||
|
||||
- label: e2e Scheduling (1 GPU)
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/v1/
|
||||
- tests/v1/e2e/general/
|
||||
commands:
|
||||
- pytest -v -s v1/e2e/general/test_async_scheduling.py
|
||||
|
||||
- label: e2e Core (1 GPU)
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/v1/
|
||||
- tests/v1/e2e/general/
|
||||
commands:
|
||||
- pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py
|
||||
|
||||
- label: V1 e2e (2 GPUs)
|
||||
timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
|
||||
@@ -46,7 +48,7 @@ steps:
|
||||
- tests/v1/e2e
|
||||
commands:
|
||||
# Only run tests that need exactly 2 GPUs
|
||||
- pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
|
||||
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_2
|
||||
@@ -62,7 +64,7 @@ steps:
|
||||
- tests/v1/e2e
|
||||
commands:
|
||||
# Only run tests that need 4 GPUs
|
||||
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
|
||||
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
|
||||
mirror:
|
||||
amd:
|
||||
device: mi325_4
|
||||
|
||||
@@ -18,9 +18,9 @@ steps:
|
||||
- pytest -v -s v1/engine/test_llm_engine.py -k "not test_engine_metrics"
|
||||
# This requires eager until we sort out CG correctness issues.
|
||||
# TODO: remove ENFORCE_EAGER here after https://github.com/vllm-project/vllm/pull/32936 is merged.
|
||||
- ENFORCE_EAGER=1 pytest -v -s v1/e2e/test_async_scheduling.py -k "not ngram"
|
||||
- pytest -v -s v1/e2e/test_context_length.py
|
||||
- pytest -v -s v1/e2e/test_min_tokens.py
|
||||
- ENFORCE_EAGER=1 pytest -v -s v1/e2e/general/test_async_scheduling.py -k "not ngram"
|
||||
- pytest -v -s v1/e2e/general/test_context_length.py
|
||||
- pytest -v -s v1/e2e/general/test_min_tokens.py
|
||||
# Temporary hack filter to exclude ngram spec decoding based tests.
|
||||
- pytest -v -s v1/entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"
|
||||
|
||||
@@ -102,9 +102,9 @@ steps:
|
||||
- vllm/v1/worker/gpu/
|
||||
- vllm/v1/worker/gpu_worker.py
|
||||
- tests/v1/spec_decode/test_max_len.py
|
||||
- tests/v1/e2e/test_spec_decode.py
|
||||
- tests/v1/e2e/spec_decode/test_spec_decode.py
|
||||
commands:
|
||||
- set -x
|
||||
- export VLLM_USE_V2_MODEL_RUNNER=1
|
||||
- pytest -v -s v1/spec_decode/test_max_len.py -k "eagle or mtp"
|
||||
- pytest -v -s v1/e2e/test_spec_decode.py -k "eagle or mtp"
|
||||
- pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle or mtp"
|
||||
|
||||
40
.buildkite/test_areas/spec_decode.yaml
Normal file
40
.buildkite/test_areas/spec_decode.yaml
Normal file
@@ -0,0 +1,40 @@
|
||||
group: Spec Decode
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Spec Decode Eagle
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/v1/spec_decode/
|
||||
- vllm/v1/worker/gpu/spec_decode/
|
||||
- tests/v1/e2e/spec_decode/
|
||||
commands:
|
||||
- pytest -v -s v1/e2e/spec_decode -k "eagle_correctness"
|
||||
|
||||
- label: Spec Decode Speculators + MTP
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/v1/spec_decode/
|
||||
- vllm/v1/worker/gpu/spec_decode/
|
||||
- vllm/transformers_utils/configs/speculators/
|
||||
- tests/v1/e2e/spec_decode/
|
||||
commands:
|
||||
- pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness"
|
||||
|
||||
- label: Spec Decode Ngram + Suffix
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/v1/spec_decode/
|
||||
- vllm/v1/worker/gpu/spec_decode/
|
||||
- tests/v1/e2e/spec_decode/
|
||||
commands:
|
||||
- pytest -v -s v1/e2e/spec_decode -k "ngram or suffix"
|
||||
|
||||
- label: Spec Decode Draft Model
|
||||
timeout_in_minutes: 30
|
||||
source_file_dependencies:
|
||||
- vllm/v1/spec_decode/
|
||||
- vllm/v1/worker/gpu/spec_decode/
|
||||
- tests/v1/e2e/spec_decode/
|
||||
commands:
|
||||
- pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference"
|
||||
Reference in New Issue
Block a user