Revert "[torch.compile] Significantly speed up cold start times" (#33820)
Signed-off-by: Richard Zou <zou3519@gmail.com>
@@ -37,13 +37,12 @@ def test_moe_compilation_cold_start(monkeypatch, use_fresh_inductor_cache):
     # The forward pass consists of 32 transformer layers.
     # Then, we split on the attention operation. This results in
     # 33 subgraphs (not including the attention operation).
-    # We then standalone_compile the unique subgraphs.
+    # The 33 subgraphs then get standalone_compile'd.
     #
     # There are actually only 3 unique subgraphs for this model
     # (all of its transformer layers are the same modulo weights);
     # this is true for most vLLM models.
-    # So we test that during cold start, only 3 subgraphs are compiled
-    # These 3 subgraphs should cache miss, and then there should be
-    # no other compilation (so no cache hits).
+    # So we test that during cold start, the aot_autograd cache
+    # misses for 3 subgraphs and hits for the rest.
     assert counters["aot_autograd"]["autograd_cache_miss"] == 3
-    assert counters["aot_autograd"]["autograd_cache_hit"] == 0
+    assert counters["aot_autograd"]["autograd_cache_hit"] == 30
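For context, a minimal Python sketch of the arithmetic behind the restored assertions. This is not part of the vLLM test; the names (N_LAYERS, n_subgraphs, n_unique) are illustrative, and the layer and subgraph counts come from the comments in the diff above.

# Sketch of why a 32-layer model yields 3 cache misses and 30 cache hits.
N_LAYERS = 32

# Splitting the layer stack at each attention op produces one subgraph
# before the first attention, one between each pair of attentions, and
# one after the last: N + 1 subgraphs in total.
n_subgraphs = N_LAYERS + 1  # 33

# The first and last subgraphs are structurally distinct, while the
# middle ones are identical modulo weights, leaving 3 unique subgraphs.
n_unique = 3

# With all 33 subgraphs standalone_compile'd, the 3 unique ones miss the
# aot_autograd cache and every repeated subgraph hits it.
cache_misses = n_unique                # == 3
cache_hits = n_subgraphs - n_unique    # == 30
assert (cache_misses, cache_hits) == (3, 30)

The reverted PR avoided the 30 redundant compilations by deduplicating subgraphs before standalone_compile, which is why its version of the test asserted 3 misses and 0 hits.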
||||