71 lines
1.9 KiB
YAML
71 lines
1.9 KiB
YAML
# Test-area definition for the "Engine" group.
# The group as a whole declares a dependency on `image-build`; the entries
# under `steps` are the individual test jobs.
group: Engine
depends_on:
- image-build
steps:
# Engine tests plus four top-level test modules, run in one pytest call.
- label: Engine
  timeout_in_minutes: 15
  # Source paths tied to this step (presumably changes here trigger the
  # step; confirm against the pipeline generator).
  source_file_dependencies:
  - vllm/
  - tests/engine
  - tests/test_sequence
  - tests/test_config
  - tests/test_logger
  - tests/test_vllm_port
  commands:
  # Covers the `engine` directory and the standalone modules listed above.
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py

# V1 end-to-end and engine tests (label says 1 GPU).
- label: V1 e2e + engine (1 GPU)
  timeout_in_minutes: 45
  source_file_dependencies:
  - vllm/
  - tests/v1
  commands:
  # TODO: accuracy does not match on H100, whether or not
  # VLLM_USE_FLASHINFER_SAMPLER is set.
  - pytest -v -s v1/e2e
  # Run this test standalone for now; the (implicit) use of spawn/fork
  # across the tests still needs to be untangled.
  - pytest -v -s v1/engine/test_preprocess_error_handling.py
  # Run the rest of the v1/engine tests.
  - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
  mirror:
    amd:
      device: mi325_1
      depends_on:
      - image-build-amd
      # NOTE(review): unlike the primary commands, this runs all of v1/engine
      # in a single pytest invocation, without isolating
      # test_preprocess_error_handling.py. Confirm that this is intentional.
      commands:
      - pytest -v -s v1/e2e
      - pytest -v -s v1/engine

# Optional two-device V1 end-to-end step.
- label: V1 e2e (2 GPUs)
  # TODO: Fix timeout after we have more confidence in the test stability.
  timeout_in_minutes: 60
  optional: true
  num_devices: 2
  source_file_dependencies:
  - vllm/
  - tests/v1/e2e
  commands:
  # Only run tests that need exactly 2 GPUs; `-k` narrows the run to test
  # names matching "tensor_parallelism".
  - pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
  mirror:
    amd:
      device: mi325_2
      depends_on:
      - image-build-amd

# Optional four-device V1 end-to-end step.
- label: V1 e2e (4 GPUs)
  # TODO: Fix timeout after we have more confidence in the test stability.
  timeout_in_minutes: 60
  optional: true
  num_devices: 4
  source_file_dependencies:
  - vllm/
  - tests/v1/e2e
  commands:
  # Only run tests that need 4 GPUs; `-k` narrows the run to test names
  # matching "eagle_correctness_heavy".
  - pytest -v -s v1/e2e/test_spec_decode.py -k "eagle_correctness_heavy"
  mirror:
    amd:
      device: mi325_4
      depends_on:
      - image-build-amd