[fix][torch.compile] Fix cold-start compilation time increase by adding kv cache update to splitting ops (#33441)
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Co-authored-by: Richard Zou <zou3519@gmail.com>
@@ -943,6 +943,15 @@ class CompilationConfig:
             # for details. Make a copy to avoid mutating the class-level
             # list via reference.
             self.splitting_ops = list(self._attention_ops)
+
+            # unified_kv_cache_update has a string param that prevents Inductor
+            # from reusing piecewise graphs. Remove it from the compiled graph.
+            # This has the side-effect of excluding cache from cudagraphs but
+            # that doesn't seem to affect performance.
+            # https://github.com/vllm-project/vllm/issues/33267
+            if not self.use_inductor_graph_partition:
+                self.splitting_ops.append("vllm::unified_kv_cache_update")
+
         elif len(self.splitting_ops) == 0:
             if (
                 self.cudagraph_mode == CUDAGraphMode.PIECEWISE
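The two ideas in the hunk above can be sketched in isolation: the config copies a class-level default list before mutating it (so instances don't alias the shared class attribute), and it only appends `vllm::unified_kv_cache_update` to the splitting ops when Inductor graph partitioning is off. This is a minimal standalone sketch, not vLLM's actual `CompilationConfig`; the class name, constructor signature, and the contents of `_attention_ops` here are hypothetical placeholders.

```python
class Config:
    # Hypothetical class-level default; the real list lives in vLLM's
    # CompilationConfig as _attention_ops.
    _attention_ops = ["vllm::unified_attention"]

    def __init__(self, use_inductor_graph_partition: bool = False):
        # Copy so that appending below mutates only this instance's list,
        # never the shared class-level default.
        self.splitting_ops = list(self._attention_ops)
        if not use_inductor_graph_partition:
            # Split the graph at the kv cache update op so Inductor can
            # reuse the piecewise graphs (per the commit's rationale).
            self.splitting_ops.append("vllm::unified_kv_cache_update")


a = Config()
b = Config(use_inductor_graph_partition=True)
assert "vllm::unified_kv_cache_update" in a.splitting_ops
assert "vllm::unified_kv_cache_update" not in b.splitting_ops
# The class-level default is untouched by either instance.
assert Config._attention_ops == ["vllm::unified_attention"]
```

Without the `list(...)` copy, `self.splitting_ops.append(...)` would mutate the class attribute itself, leaking the op into every other instance.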