[compile] Add compiled artifact counter for VLLM_USE_MEGA_AOT_ARTIFACT=1. (#37589)

Signed-off-by: zhxchen17 <zhxchen17@fb.com>
This commit is contained in:
Zhengxu Chen
2026-03-20 12:22:46 -04:00
committed by GitHub
parent 880be2b1b8
commit 2e089b96a8
2 changed files with 17 additions and 2 deletions

View File

@@ -9,11 +9,15 @@ then runs in the parent with clean in-memory state but populated caches.
import multiprocessing as mp
import pytest
from torch._dynamo.utils import counters
import vllm.envs as envs
from vllm.compilation.counter import compilation_counter
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
from ..utils import fork_new_process_for_each_test
MODEL = "microsoft/Phi-tiny-MoE-instruct"
@@ -45,8 +49,11 @@ def _cold_start(vllm_runner):
assert counters["aot_autograd"]["autograd_cache_hit"] == 0
def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache):
@fork_new_process_for_each_test
@pytest.mark.parametrize("mega_aot_artifact", ["0", "1"])
def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache, mega_aot_artifact):
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
monkeypatch.setenv("VLLM_USE_MEGA_AOT_ARTIFACT", mega_aot_artifact)
# Cold start in a forked child (must fork before CUDA init).
# This model has 32 identical transformer layers which produce
@@ -64,7 +71,12 @@ def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache):
num_compiled_artifacts_saved=0,
):
_run_vllm(vllm_runner)
assert counters["aot_autograd"]["total"] == 30
if envs.VLLM_USE_MEGA_AOT_ARTIFACT:
# MEGA_AOT_ARTIFACT is enabled, so aot_autograd should not run on any
# subgraphs.
assert counters["aot_autograd"]["total"] == 0
else:
assert counters["aot_autograd"]["total"] == 30
assert counters["aot_autograd"]["autograd_cache_miss"] == 0
assert (
counters["aot_autograd"]["autograd_cache_hit"] == 0