[fix][torch.compile] Fix cold-start compilation time increase by adding kv cache update to splitting ops (#33441)

Signed-off-by: Luka Govedič <lgovedic@redhat.com> Co-authored-by: Richard Zou <zou3519@gmail.com>
2026-01-31 09:48:34 -05:00
parent 793af538a3
commit 15f40b20aa
4 changed files with 127 additions and 1 deletions
--- a/tests/compile/test_cold_start.py
+++ b/tests/compile/test_cold_start.py
@@ -0,0 +1,48 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from torch._dynamo.utils import counters
+
+from vllm import LLM
+from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
+
+
+def test_moe_compilation_cold_start(monkeypatch, use_fresh_inductor_cache):
+    """Cold-start compile should miss the aot_autograd cache exactly 3 times."""
+    # Run in same process so we can access PyTorch's internal counters
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+
+    # Turn VLLM_USE_AOT_COMPILE off; unclear whether it would change the counts
+    monkeypatch.setenv("VLLM_USE_AOT_COMPILE", "0")
+
+    # Force cold compilation
+    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
+
+    compilation_config = CompilationConfig(
+        mode=CompilationMode.VLLM_COMPILE,
+        cudagraph_mode=CUDAGraphMode.NONE,  # make the model loading faster
+    )
+
+    # Reset so the assertions below only count this LLM construction
+    counters.clear()
+
+    _ = LLM(
+        model="microsoft/Phi-tiny-MoE-instruct",
+        max_model_len=256,
+        load_format="dummy",  # make the model loading faster
+        compilation_config=compilation_config,
+        num_gpu_blocks_override=8,  # make the model loading faster
+    )
+
+    # vLLM-compile cold start is special: by default we do one full dynamo
+    # capture of the entire forward pass (32 transformer layers here), then
+    # split on the attention operation. This results in 33 subgraphs (not
+    # including the attention operation), each of which then gets
+    # standalone_compile'd.
+    #
+    # There are actually only 3 unique subgraphs for this model (all of its
+    # transformer layers are the same modulo weights); this is true for most
+    # vLLM models. So we test that during cold start, the aot_autograd cache
+    # misses for the 3 unique subgraphs and hits for the remaining 30.
+    assert counters["aot_autograd"]["autograd_cache_miss"] == 3
+    assert counters["aot_autograd"]["autograd_cache_hit"] == 30