Files
vllm/tests/compile/test_startup.py
2026-03-22 16:02:03 +08:00

88 lines
3.0 KiB
Python

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Cold start and warm start tests for vLLM-compile.
Cold start runs in a forked child (must fork before CUDA init) which
populates on-disk caches and asserts cold-start counters. Warm start
then runs in the parent with clean in-memory state but populated caches.
"""
import multiprocessing as mp
import pytest
from torch._dynamo.utils import counters
import vllm.envs as envs
from vllm.compilation.counter import compilation_counter
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
from vllm.utils.torch_utils import is_torch_equal_or_newer
from ..utils import fork_new_process_for_each_test
# Model identifier passed as the first argument to ``vllm_runner`` by
# ``_run_vllm``; both the cold-start and warm-start runs use it.
MODEL = "microsoft/Phi-tiny-MoE-instruct"
def _run_vllm(vllm_runner):
    """Enter and immediately leave a ``vllm_runner`` context for ``MODEL``.

    The ``with`` body is empty, so the only work performed is whatever the
    runner does when it is constructed, entered and exited.

    Args:
        vllm_runner: Callable invoked with the model name plus the engine
            keyword arguments below; its result is used as a context manager.
    """
    # Request vLLM compilation with ``cudagraph_mode`` set to ``NONE``.
    compile_cfg = CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,
        cudagraph_mode=CUDAGraphMode.NONE,
    )
    engine_kwargs = {
        "trust_remote_code": False,
        "max_model_len": 256,
        "max_num_batched_tokens": 1024,
        "load_format": "dummy",
        "compilation_config": compile_cfg,
        "num_gpu_blocks_override": 8,
    }
    with vllm_runner(MODEL, **engine_kwargs):
        pass
def _cold_start(vllm_runner):
    """Run vLLM once and assert the cold-start compilation counters.

    Per the module docstring this is meant to run in a forked child process,
    where a failed assertion surfaces to the parent as a non-zero exit code.

    Args:
        vllm_runner: Runner factory forwarded unchanged to ``_run_vllm``.
    """
    counters.clear()

    # Cold start: three artifacts are expected to be saved and none loaded.
    expected_artifacts = {
        "num_compiled_artifacts_saved": 3,
        "num_compiled_artifacts_loaded": 0,
    }
    with compilation_counter.expect(**expected_artifacts):
        _run_vllm(vllm_runner)

    # Inspect the dynamo "aot_autograd" counter bucket after the run.
    aot_stats = counters["aot_autograd"]
    assert aot_stats["total"] == 33
    assert aot_stats["autograd_cache_miss"] == 3
    assert aot_stats["autograd_cache_hit"] == 0
@fork_new_process_for_each_test
@pytest.mark.parametrize("mega_aot_artifact", ["0", "1"])
def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache, mega_aot_artifact):
    """Check compilation counters for a cold start followed by a warm start.

    The cold start runs ``_cold_start`` in a forked child process and must
    exit with code 0. The warm start then runs in this process and asserts
    that artifacts are loaded rather than saved.

    Args:
        monkeypatch: pytest fixture used to set environment variables.
        vllm_runner: Runner factory handed to ``_cold_start`` / ``_run_vllm``.
        fresh_vllm_cache: Fixture requested for its side effects only; it is
            not referenced in the body.
        mega_aot_artifact: ``"0"`` or ``"1"``, exported as
            ``VLLM_USE_MEGA_AOT_ARTIFACT``.
    """
    env_overrides = (
        ("VLLM_ENABLE_V1_MULTIPROCESSING", "0"),
        ("VLLM_USE_MEGA_AOT_ARTIFACT", mega_aot_artifact),
    )
    for env_name, env_value in env_overrides:
        monkeypatch.setenv(env_name, env_value)

    # Cold start in a forked child (must fork before CUDA init).
    # This model has 32 identical transformer layers which produce
    # 33 subgraphs after splitting on attention — only 3 are unique.
    fork_ctx = mp.get_context("fork")
    cold_proc = fork_ctx.Process(target=_cold_start, args=(vllm_runner,))
    cold_proc.start()
    cold_proc.join()
    assert cold_proc.exitcode == 0, "Cold-start child failed"

    # Warm start — compiled artifacts loaded from disk cache.
    counters.clear()
    expected_artifacts = {
        "num_compiled_artifacts_loaded": 3,
        "num_compiled_artifacts_saved": 0,
    }
    with compilation_counter.expect(**expected_artifacts):
        _run_vllm(vllm_runner)

    mega_aot_active = envs.VLLM_USE_MEGA_AOT_ARTIFACT and is_torch_equal_or_newer(
        "2.10.0"
    )
    aot_stats = counters["aot_autograd"]

    # With MEGA_AOT_ARTIFACT enabled, no aot_autograd run is expected on the
    # subgraphs; otherwise the warm start is expected to record 30.
    expected_total = 0 if mega_aot_active else 30
    assert aot_stats["total"] == expected_total

    # Neither a miss nor a hit is expected at the aot_autograd cache level
    # during the warm start.
    assert aot_stats["autograd_cache_miss"] == 0
    assert aot_stats["autograd_cache_hit"] == 0