[torch.compile] store inductor compiled Python file (#12182)

Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-19 16:27:26 +08:00
parent 630eb5b5ce
commit e66faf4809
2 changed files with 60 additions and 33 deletions
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2862,17 +2862,8 @@ class CompilationConfig(BaseModel):
                    "vllm.unified_attention_with_output",
                ]
            else:
-                # v0 can use full graph compilation without splitting,
-                # splitting is optional.
-                # right now we still need it. kv cache shape
-                # will be included in the graph if we don't split
-                # the graph.
-                # TODO: hide kv cache in static forward context
-                # so that inductor does not see it.
-                self.splitting_ops = [
-                    "vllm.unified_attention",
-                    "vllm.unified_attention_with_output",
-                ]
+                # v0 uses full graph compilation: no splitting ops
+                self.splitting_ops = []

        for k, v in self.inductor_passes.items():
            if not isinstance(v, str):