# CI pipeline group for the V2 GPU model runner (enabled via VLLM_USE_V2_MODEL_RUNNER=1).
# Each step declares source_file_dependencies so it only runs when relevant files change.
group: Model Runner V2
depends_on:
  - image-build
steps:
  - label: Model Runner V2 Core Tests
    timeout_in_minutes: 45
    source_file_dependencies:
      - vllm/v1/worker/gpu/
      - vllm/v1/worker/gpu_worker.py
      - vllm/v1/core/sched/
      - vllm/v1/attention/
      - tests/v1/engine/test_llm_engine.py
      - tests/v1/e2e/
      - tests/v1/entrypoints/llm/test_struct_output_generate.py
    commands:
      - set -x
      - export VLLM_USE_V2_MODEL_RUNNER=1
      - pytest -v -s v1/engine/test_llm_engine.py -k "not test_engine_metrics"
      # This requires eager until we sort out CG correctness issues.
      # TODO: remove ENFORCE_EAGER here after https://github.com/vllm-project/vllm/pull/32936 is merged.
      - ENFORCE_EAGER=1 pytest -v -s v1/e2e/test_async_scheduling.py -k "not ngram"
      - pytest -v -s v1/e2e/test_context_length.py
      - pytest -v -s v1/e2e/test_min_tokens.py
      # Temporary hack filter to exclude ngram spec decoding based tests.
      - pytest -v -s v1/entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"

  - label: Model Runner V2 Examples
    timeout_in_minutes: 45
    working_dir: "/vllm-workspace/examples"
    source_file_dependencies:
      - vllm/v1/worker/gpu/
      - vllm/v1/core/sched/
      - vllm/v1/worker/gpu_worker.py
      - examples/offline_inference/
      - examples/basic/offline_inference/
      - examples/pooling/embed/vision_embedding_offline.py
      - examples/others/tensorize_vllm_model.py
    commands:
      - set -x
      - export VLLM_USE_V2_MODEL_RUNNER=1
      - pip install tensorizer  # for tensorizer test
      - python3 basic/offline_inference/chat.py  # for basic
      - python3 basic/offline_inference/generate.py --model facebook/opt-125m
      #- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 # TODO
      #- python3 basic/offline_inference/embed.py # TODO
      # for multi-modal models
      - python3 offline_inference/audio_language.py --seed 0
      - python3 offline_inference/vision_language.py --seed 0
      - python3 offline_inference/vision_language_multi_image.py --seed 0
      # TODO: uncomment once https://github.com/vllm-project/vllm/pull/35790 is merged.
      #- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 # TODO
      # for pooling models
      - python3 pooling/embed/vision_embedding_offline.py --seed 0
      # for features demo
      - python3 offline_inference/prefix_caching.py
      - python3 offline_inference/llm_engine_example.py
      - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
      - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
      # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
      - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536

  - label: Model Runner V2 Distributed (2 GPUs)
    timeout_in_minutes: 45
    working_dir: "/vllm-workspace/tests"
    num_devices: 2
    source_file_dependencies:
      - vllm/v1/worker/gpu/
      - vllm/v1/worker/gpu_worker.py
      - tests/basic_correctness/test_basic_correctness.py
      - tests/v1/distributed/test_async_llm_dp.py
      - tests/v1/distributed/test_eagle_dp.py
    commands:
      - set -x
      - export VLLM_USE_V2_MODEL_RUNNER=1
      # The "and not True" here is a hacky way to exclude the prompt_embeds cases which aren't yet supported.
      - TARGET_TEST_SUITE=L4 pytest -v -s basic_correctness/test_basic_correctness.py -m 'distributed(num_gpus=2)' -k "not ray and not True"
      # https://github.com/NVIDIA/nccl/issues/1838
      - export NCCL_CUMEM_HOST_ENABLE=0
      - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py -k "not ray"
      - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py

  # These require fix https://github.com/vllm-project/vllm/pull/36280
  - label: Model Runner V2 Pipeline Parallelism (4 GPUs)
    timeout_in_minutes: 60
    working_dir: "/vllm-workspace/tests"
    num_devices: 4
    source_file_dependencies:
      - vllm/v1/worker/gpu/
      - vllm/v1/worker/gpu_worker.py
      - tests/distributed/test_pipeline_parallel.py
      #- tests/distributed/test_pp_cudagraph.py
    commands:
      - set -x
      - export VLLM_USE_V2_MODEL_RUNNER=1
      - pytest -v -s distributed/test_pipeline_parallel.py -k "not ray and not Jamba"
      # TODO: Uncomment once https://github.com/vllm-project/vllm/pull/35162 is merged.
      #- pytest -v -s distributed/test_pp_cudagraph.py -k "not ray"

  - label: Model Runner V2 Spec Decode
    timeout_in_minutes: 30
    working_dir: "/vllm-workspace/tests"
    source_file_dependencies:
      - vllm/v1/worker/gpu/
      - vllm/v1/worker/gpu_worker.py
      - tests/v1/spec_decode/test_max_len.py
      - tests/v1/e2e/test_spec_decode.py
    commands:
      - set -x
      - export VLLM_USE_V2_MODEL_RUNNER=1
      - pytest -v -s v1/spec_decode/test_max_len.py -k "eagle or mtp"
      - pytest -v -s v1/e2e/test_spec_decode.py -k "eagle or mtp"