[torch.compile] Improve cold and warm start compile tests (#35709)

Signed-off-by: Richard Zou <zou3519@gmail.com>
2026-03-02 14:27:06 -05:00
parent 2a9e3347e9
commit d1a6e96d9e
5 changed files with 82 additions and 48 deletions
--- a/tests/compile/test_cold_start.py
+++ b/tests/compile/test_cold_start.py
@@ -1,48 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from torch._dynamo.utils import counters
-
-from vllm import LLM
-from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
-
-
-def test_moe_compilation_cold_start(monkeypatch, use_fresh_inductor_cache):
-    # Run in same process so we can access PyTorch's internal counters
-    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
-
-    # I'm not sure if this is going to affect the numbers
-    monkeypatch.setenv("VLLM_USE_AOT_COMPILE", "0")
-
-    # Force cold compilation
-    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
-
-    compilation_config = CompilationConfig(
-        mode=CompilationMode.VLLM_COMPILE,
-        cudagraph_mode=CUDAGraphMode.NONE,  # make the model loading faster
-    )
-
-    counters.clear()
-
-    _ = LLM(
-        model="microsoft/Phi-tiny-MoE-instruct",
-        max_model_len=256,
-        load_format="dummy",  # make the model loading faster
-        compilation_config=compilation_config,
-        num_gpu_blocks_override=8,  # make the model loading faster
-    )
-
-    # vLLM-compile cold start is special. By default, we do
-    # one full dynamo capture of the entire forward pass.
-    # The forward pass consists of 32 transformer layers.
-    # Then, we split on the attention operation. This results in
-    # 33 subgraphs (not including the attention operation).
-    # We then generate compiled artifacts for the unique subgraphs.
-    #
-    # There are actually only 3 unique subgraphs for this model
-    # (all of its transformer layers are the same modulo weights);
-    # this is true for most vLLM models.
-    # So we test that during cold start, we are only compling
-    # for 3 unique subgraphs.
-    assert counters["aot_autograd"]["autograd_cache_miss"] == 3
-    assert counters["aot_autograd"]["autograd_cache_hit"] == 0
--- a/tests/compile/test_startup.py
+++ b/tests/compile/test_startup.py
@@ -0,0 +1,71 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Cold start and warm start tests for vLLM-compile.
+
+Cold start runs in a forked child (must fork before CUDA init) which
+populates on-disk caches and asserts cold-start counters.  Warm start
+then runs in the parent with clean in-memory state but populated caches.
+"""
+
+import multiprocessing as mp
+
+from torch._dynamo.utils import counters
+
+from vllm.compilation.counter import compilation_counter
+from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
+
+MODEL = "microsoft/Phi-tiny-MoE-instruct"
+
+
+def _run_vllm(vllm_runner):
+    """Enter and immediately exit a ``vllm_runner`` context for ``MODEL``."""
+    with vllm_runner(
+        MODEL,
+        trust_remote_code=False,
+        max_model_len=256,
+        max_num_batched_tokens=1024,
+        load_format="dummy",
+        compilation_config=CompilationConfig(
+            mode=CompilationMode.VLLM_COMPILE, cudagraph_mode=CUDAGraphMode.NONE
+        ),
+        num_gpu_blocks_override=8,
+    ):
+        pass
+
+
+def _cold_start(vllm_runner):
+    """Cold-start run for the forked child; asserts the cold-start counters."""
+    counters.clear()
+    expected = {"num_compiled_artifacts_saved": 3, "num_compiled_artifacts_loaded": 0}
+    with compilation_counter.expect(**expected):
+        _run_vllm(vllm_runner)
+    aot = counters["aot_autograd"]
+    assert aot["total"] == 33
+    assert aot["autograd_cache_miss"] == 3
+    assert aot["autograd_cache_hit"] == 0
+
+
+def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache):
+    """Check compile counters on a cold start, then on a warm start."""
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+
+    # Cold start in a forked child (must fork before CUDA init).
+    # This model has 32 identical transformer layers which produce
+    # 33 subgraphs after splitting on attention — only 3 are unique.
+    fork_ctx = mp.get_context("fork")
+    child = fork_ctx.Process(target=_cold_start, args=(vllm_runner,))
+    child.start()
+    child.join()
+    assert child.exitcode == 0, "Cold-start child failed"
+
+    # Warm start — compiled artifacts loaded from disk cache.
+    # TODO: warm start should not save any artifacts (1 is saved today)
+    # https://github.com/vllm-project/vllm/issues/35708
+    counters.clear()
+    expected = {"num_compiled_artifacts_loaded": 3, "num_compiled_artifacts_saved": 1}
+    with compilation_counter.expect(**expected):
+        _run_vllm(vllm_runner)
+    aot = counters["aot_autograd"]
+    assert aot["total"] == 30
+    assert aot["autograd_cache_miss"] == 0
+    assert aot["autograd_cache_hit"] == 1