# CI pipeline group for Model Runner V2: core tests, example scripts,
# multi-GPU distributed / pipeline-parallel runs, and speculative decoding.
# Every step exports VLLM_USE_V2_MODEL_RUNNER=1 to opt into the V2 runner.
group: Model Runner V2
depends_on:
  - image-build
steps:
  - label: Model Runner V2 Core Tests
    timeout_in_minutes: 45
    source_file_dependencies:
      - vllm/v1/worker/gpu/
      - vllm/v1/worker/gpu_worker.py
      - vllm/v1/core/sched/
      - vllm/v1/attention/
      - tests/v1/engine/test_llm_engine.py
      - tests/v1/e2e/
      - tests/entrypoints/llm/test_struct_output_generate.py
    commands:
      - set -x
      - export VLLM_USE_V2_MODEL_RUNNER=1
      - pytest -v -s v1/engine/test_llm_engine.py -k "not test_engine_metrics"
      # This requires eager until we sort out CG correctness issues.
      # TODO: remove ENFORCE_EAGER here after https://github.com/vllm-project/vllm/pull/32936 is merged.
      - ENFORCE_EAGER=1 pytest -v -s v1/e2e/general/test_async_scheduling.py -k "not ngram"
      - pytest -v -s v1/e2e/general/test_context_length.py
      - pytest -v -s v1/e2e/general/test_min_tokens.py
      # Temporary hack filter to exclude ngram spec decoding based tests.
      - pytest -v -s entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"

  - label: Model Runner V2 Examples
    timeout_in_minutes: 45
    working_dir: "/vllm-workspace/examples"
    source_file_dependencies:
      - vllm/v1/worker/gpu/
      - vllm/v1/core/sched/
      - vllm/v1/worker/gpu_worker.py
      - examples/offline_inference/
      - examples/basic/offline_inference/
      - examples/pooling/embed/vision_embedding_offline.py
      - examples/others/tensorize_vllm_model.py
    commands:
      - set -x
      - export VLLM_USE_V2_MODEL_RUNNER=1
      - pip install tensorizer # for tensorizer test
      - python3 basic/offline_inference/chat.py # for basic
      - python3 basic/offline_inference/generate.py --model facebook/opt-125m
      #- python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 # TODO
      #- python3 basic/offline_inference/embed.py # TODO
      # for multi-modal models
      - python3 offline_inference/audio_language.py --seed 0
      - python3 offline_inference/vision_language.py --seed 0
      - python3 offline_inference/vision_language_multi_image.py --seed 0
      - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
      # for pooling models
      - python3 pooling/embed/vision_embedding_offline.py --seed 0
      # for features demo
      - python3 offline_inference/prefix_caching.py
      - python3 offline_inference/llm_engine_example.py
      - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
      - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
      # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
      - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536

  - label: Model Runner V2 Distributed (2 GPUs)
    timeout_in_minutes: 45
    working_dir: "/vllm-workspace/tests"
    num_devices: 2
    source_file_dependencies:
      - vllm/v1/worker/gpu/
      - vllm/v1/worker/gpu_worker.py
      - tests/basic_correctness/test_basic_correctness.py
      - tests/v1/distributed/test_async_llm_dp.py
      - tests/v1/distributed/test_eagle_dp.py
    commands:
      - set -x
      - export VLLM_USE_V2_MODEL_RUNNER=1
      # The "and not True" here is a hacky way to exclude the prompt_embeds cases which aren't yet supported.
      - TARGET_TEST_SUITE=L4 pytest -v -s basic_correctness/test_basic_correctness.py -m 'distributed(num_gpus=2)' -k "not ray and not True"
      # https://github.com/NVIDIA/nccl/issues/1838
      - export NCCL_CUMEM_HOST_ENABLE=0
      - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py -k "not ray"
      - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py

  - label: Model Runner V2 Pipeline Parallelism (4 GPUs)
    timeout_in_minutes: 60
    working_dir: "/vllm-workspace/tests"
    num_devices: 4
    source_file_dependencies:
      - vllm/v1/worker/gpu/
      - vllm/v1/worker/gpu_worker.py
      - tests/distributed/test_pipeline_parallel.py
      - tests/distributed/test_pp_cudagraph.py
    commands:
      - set -x
      - export VLLM_USE_V2_MODEL_RUNNER=1
      - pytest -v -s distributed/test_pipeline_parallel.py -k "not ray and not Jamba"
      - pytest -v -s distributed/test_pp_cudagraph.py -k "not ray"

  - label: Model Runner V2 Spec Decode
    timeout_in_minutes: 30
    working_dir: "/vllm-workspace/tests"
    source_file_dependencies:
      - vllm/v1/worker/gpu/
      - vllm/v1/worker/gpu_worker.py
      - tests/v1/spec_decode/test_max_len.py
      - tests/v1/spec_decode/test_probabilistic_rejection_sampler_utils.py
      - tests/v1/spec_decode/test_synthetic_rejection_sampler_utils.py
      - tests/v1/e2e/spec_decode/test_spec_decode.py
    commands:
      - set -x
      - export VLLM_USE_V2_MODEL_RUNNER=1
      - pytest -v -s v1/spec_decode/test_max_len.py -k "eagle or mtp"
      - pytest -v -s v1/spec_decode/test_probabilistic_rejection_sampler_utils.py
      - pytest -v -s v1/spec_decode/test_synthetic_rejection_sampler_utils.py
      - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle or mtp"