tests/compile/test_startup.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Cold start and warm start tests for vLLM-compile.

Cold start runs in a forked child (must fork before CUDA init) which
populates on-disk caches and asserts cold-start counters.  Warm start
then runs in the parent with clean in-memory state but populated caches.
"""

import multiprocessing as mp

from torch._dynamo.utils import counters

from vllm.compilation.counter import compilation_counter
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode

MODEL = "microsoft/Phi-tiny-MoE-instruct"


def _run_vllm(vllm_runner):
    with vllm_runner(
        MODEL,
        trust_remote_code=False,
        max_model_len=256,
        max_num_batched_tokens=1024,
        load_format="dummy",
        compilation_config=CompilationConfig(
            mode=CompilationMode.VLLM_COMPILE,
            cudagraph_mode=CUDAGraphMode.NONE,
        ),
        num_gpu_blocks_override=8,
    ):
        pass


def _cold_start(vllm_runner):
    counters.clear()
    with compilation_counter.expect(
        num_compiled_artifacts_saved=3,
        num_compiled_artifacts_loaded=0,
    ):
        _run_vllm(vllm_runner)
    assert counters["aot_autograd"]["total"] == 33
    assert counters["aot_autograd"]["autograd_cache_miss"] == 3
    assert counters["aot_autograd"]["autograd_cache_hit"] == 0


def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache):
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    # Cold start in a forked child (must fork before CUDA init).
    # This model has 32 identical transformer layers which produce
    # 33 subgraphs after splitting on attention — only 3 are unique.
    ctx = mp.get_context("fork")
    p = ctx.Process(target=_cold_start, args=(vllm_runner,))
    p.start()
    p.join()
    assert p.exitcode == 0, "Cold-start child failed"

    # Warm start — compiled artifacts loaded from disk cache.
    counters.clear()
    with compilation_counter.expect(
        num_compiled_artifacts_loaded=3,
        # TODO: warm start should not save any artifacts
        # https://github.com/vllm-project/vllm/issues/35708
        num_compiled_artifacts_saved=1,
    ):
        _run_vllm(vllm_runner)
    assert counters["aot_autograd"]["total"] == 30
    assert counters["aot_autograd"]["autograd_cache_miss"] == 0
    assert counters["aot_autograd"]["autograd_cache_hit"] == 1
[torch.compile] Improve cold and warm start compile tests (#35709) Signed-off-by: Richard Zou <zou3519@gmail.com> 2026-03-02 14:27:06 -05:00			`# SPDX-License-Identifier: Apache-2.0`
			`# SPDX-FileCopyrightText: Copyright contributors to the vLLM project`
			`"""Cold start and warm start tests for vLLM-compile.`

			`Cold start runs in a forked child (must fork before CUDA init) which`
			`populates on-disk caches and asserts cold-start counters. Warm start`
			`then runs in the parent with clean in-memory state but populated caches.`
			`"""`

			`import multiprocessing as mp`

			`from torch._dynamo.utils import counters`

			`from vllm.compilation.counter import compilation_counter`
			`from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode`

			`MODEL = "microsoft/Phi-tiny-MoE-instruct"`


			`def _run_vllm(vllm_runner):`
			`with vllm_runner(`
			`MODEL,`
			`trust_remote_code=False,`
			`max_model_len=256,`
			`max_num_batched_tokens=1024,`
			`load_format="dummy",`
			`compilation_config=CompilationConfig(`
			`mode=CompilationMode.VLLM_COMPILE,`
			`cudagraph_mode=CUDAGraphMode.NONE,`
			`),`
			`num_gpu_blocks_override=8,`
			`):`
			`pass`


			`def _cold_start(vllm_runner):`
			`counters.clear()`
			`with compilation_counter.expect(`
			`num_compiled_artifacts_saved=3,`
			`num_compiled_artifacts_loaded=0,`
			`):`
			`_run_vllm(vllm_runner)`
			`assert counters["aot_autograd"]["total"] == 33`
			`assert counters["aot_autograd"]["autograd_cache_miss"] == 3`
			`assert counters["aot_autograd"]["autograd_cache_hit"] == 0`


			`def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache):`
			`monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")`

			`# Cold start in a forked child (must fork before CUDA init).`
			`# This model has 32 identical transformer layers which produce`
			`# 33 subgraphs after splitting on attention — only 3 are unique.`
			`ctx = mp.get_context("fork")`
			`p = ctx.Process(target=_cold_start, args=(vllm_runner,))`
			`p.start()`
			`p.join()`
			`assert p.exitcode == 0, "Cold-start child failed"`

			`# Warm start — compiled artifacts loaded from disk cache.`
			`counters.clear()`
			`with compilation_counter.expect(`
			`num_compiled_artifacts_loaded=3,`
			`# TODO: warm start should not save any artifacts`
			`# https://github.com/vllm-project/vllm/issues/35708`
			`num_compiled_artifacts_saved=1,`
			`):`
			`_run_vllm(vllm_runner)`
			`assert counters["aot_autograd"]["total"] == 30`
			`assert counters["aot_autograd"]["autograd_cache_miss"] == 0`
			`assert counters["aot_autograd"]["autograd_cache_hit"] == 1`