# CI pipeline group for Model Runner V2: core tests, example scripts,
# multi-GPU distributed / pipeline-parallel runs, and speculative decoding.
# Every step exports VLLM_USE_V2_MODEL_RUNNER=1 to opt into the V2 runner.
group: Model Runner V2
depends_on:
  - image-build
steps:
  - label: Model Runner V2 Core Tests
    timeout_in_minutes: 45
    source_file_dependencies:
      - vllm/v1/worker/gpu/
      - vllm/v1/worker/gpu_worker.py
      - vllm/v1/core/sched/
      - vllm/v1/attention/
      - tests/v1/engine/test_llm_engine.py
      - tests/v1/e2e/
      - tests/entrypoints/llm/test_struct_output_generate.py
    commands:
      - set -x
      - export VLLM_USE_V2_MODEL_RUNNER=1
      - pytest -v -s v1/engine/test_llm_engine.py -k "not test_engine_metrics"
      # This requires eager until we sort out CG correctness issues.
      # TODO: remove ENFORCE_EAGER here after https://github.com/vllm-project/vllm/pull/32936 is merged.
      - ENFORCE_EAGER=1 pytest -v -s v1/e2e/general/test_async_scheduling.py -k "not ngram"
      - pytest -v -s v1/e2e/general/test_context_length.py
      - pytest -v -s v1/e2e/general/test_min_tokens.py
      # Temporary hack filter to exclude ngram spec decoding based tests.
      - pytest -v -s entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"

  - label: Model Runner V2 Examples
    timeout_in_minutes: 45
    working_dir: "/vllm-workspace/examples"
    source_file_dependencies:
      - vllm/v1/worker/gpu/
      - vllm/v1/core/sched/
      - vllm/v1/worker/gpu_worker.py
      - examples/offline_inference/
      - examples/basic/offline_inference/
      - examples/pooling/embed/vision_embedding_offline.py
      - examples/others/tensorize_vllm_model.py
    commands:
      - set -x
      - export VLLM_USE_V2_MODEL_RUNNER=1
      - pip install tensorizer # for tensorizer test
      - python3 basic/offline_inference/chat.py # for basic
      - python3 basic/offline_inference/generate.py --model facebook/opt-125m
      #- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 # TODO
      #- python3 basic/offline_inference/embed.py # TODO
      # for multi-modal models
      - python3 offline_inference/audio_language.py --seed 0
      - python3 offline_inference/vision_language.py --seed 0
      - python3 offline_inference/vision_language_multi_image.py --seed 0
      - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
      # for pooling models
      - python3 pooling/embed/vision_embedding_offline.py --seed 0
      # for features demo
      - python3 offline_inference/prefix_caching.py
      - python3 offline_inference/llm_engine_example.py
      - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
      - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
      # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
      - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536

  - label: Model Runner V2 Distributed (2 GPUs)
    timeout_in_minutes: 45
    working_dir: "/vllm-workspace/tests"
    num_devices: 2
    source_file_dependencies:
      - vllm/v1/worker/gpu/
      - vllm/v1/worker/gpu_worker.py
      - tests/basic_correctness/test_basic_correctness.py
      - tests/v1/distributed/test_async_llm_dp.py
      - tests/v1/distributed/test_eagle_dp.py
    commands:
      - set -x
      - export VLLM_USE_V2_MODEL_RUNNER=1
      # The "and not True" here is a hacky way to exclude the prompt_embeds cases which aren't yet supported.
      - TARGET_TEST_SUITE=L4 pytest -v -s basic_correctness/test_basic_correctness.py -m 'distributed(num_gpus=2)' -k "not ray and not True"
      # https://github.com/NVIDIA/nccl/issues/1838
      - export NCCL_CUMEM_HOST_ENABLE=0
      - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py -k "not ray"
      - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py

  - label: Model Runner V2 Pipeline Parallelism (4 GPUs)
    timeout_in_minutes: 60
    working_dir: "/vllm-workspace/tests"
    num_devices: 4
    source_file_dependencies:
      - vllm/v1/worker/gpu/
      - vllm/v1/worker/gpu_worker.py
      - tests/distributed/test_pipeline_parallel.py
      - tests/distributed/test_pp_cudagraph.py
    commands:
      - set -x
      - export VLLM_USE_V2_MODEL_RUNNER=1
      - pytest -v -s distributed/test_pipeline_parallel.py -k "not ray and not Jamba"
      - pytest -v -s distributed/test_pp_cudagraph.py -k "not ray"

  - label: Model Runner V2 Spec Decode
    timeout_in_minutes: 30
    working_dir: "/vllm-workspace/tests"
    source_file_dependencies:
      - vllm/v1/worker/gpu/
      - vllm/v1/worker/gpu_worker.py
      - tests/v1/spec_decode/test_max_len.py
      - tests/v1/spec_decode/test_probabilistic_rejection_sampler_utils.py
      - tests/v1/spec_decode/test_synthetic_rejection_sampler_utils.py
      - tests/v1/e2e/spec_decode/test_spec_decode.py
    commands:
      - set -x
      - export VLLM_USE_V2_MODEL_RUNNER=1
      - pytest -v -s v1/spec_decode/test_max_len.py -k "eagle or mtp"
      - pytest -v -s v1/spec_decode/test_probabilistic_rejection_sampler_utils.py
      - pytest -v -s v1/spec_decode/test_synthetic_rejection_sampler_utils.py
      - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle or mtp"