diff --git a/tests/compile/test_startup.py b/tests/compile/test_startup.py
index 545299565..32a586011 100644
--- a/tests/compile/test_startup.py
+++ b/tests/compile/test_startup.py
@@ -9,11 +9,15 @@ then runs in the parent with clean in-memory state but populated caches.
 
 import multiprocessing as mp
 
+import pytest
 from torch._dynamo.utils import counters
 
+import vllm.envs as envs
 from vllm.compilation.counter import compilation_counter
 from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
 
+from ..utils import fork_new_process_for_each_test
+
 MODEL = "microsoft/Phi-tiny-MoE-instruct"
 
 
@@ -45,8 +49,11 @@ def _cold_start(vllm_runner):
     assert counters["aot_autograd"]["autograd_cache_hit"] == 0
 
 
-def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache):
+@fork_new_process_for_each_test
+@pytest.mark.parametrize("mega_aot_artifact", ["0", "1"])
+def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache, mega_aot_artifact):
     monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+    monkeypatch.setenv("VLLM_USE_MEGA_AOT_ARTIFACT", mega_aot_artifact)
 
     # Cold start in a forked child (must fork before CUDA init).
     # This model has 32 identical transformer layers which produce
@@ -64,7 +71,12 @@ def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache):
         num_compiled_artifacts_saved=0,
     ):
         _run_vllm(vllm_runner)
-    assert counters["aot_autograd"]["total"] == 30
+    if envs.VLLM_USE_MEGA_AOT_ARTIFACT:
+        # With VLLM_USE_MEGA_AOT_ARTIFACT enabled, aot_autograd is not expected
+        # to run on any subgraph, so its total stays at 0.
+        assert counters["aot_autograd"]["total"] == 0
+    else:
+        assert counters["aot_autograd"]["total"] == 30
     assert counters["aot_autograd"]["autograd_cache_miss"] == 0
     assert (
         counters["aot_autograd"]["autograd_cache_hit"] == 0
diff --git a/vllm/compilation/caching.py b/vllm/compilation/caching.py
index 1f5a87304..c089f02a3 100644
--- a/vllm/compilation/caching.py
+++ b/vllm/compilation/caching.py
@@ -17,6 +17,7 @@ from torch.utils import _pytree as pytree
 
 import vllm.envs as envs
 from vllm.compilation.compiler_interface import get_inductor_factors
+from vllm.compilation.counter import compilation_counter
 from vllm.config import VllmConfig, get_current_vllm_config
 from vllm.config.utils import hash_factors
 from vllm.logger import init_logger
@@ -61,6 +62,7 @@ class StandaloneCompiledArtifacts:
         self.submodule_bytes[f"{submod_name}_{shape}"] = hex_digest
         if hex_digest not in self.submodule_bytes_store:
             self.submodule_bytes_store[hex_digest] = entry
+            compilation_counter.num_compiled_artifacts_saved += 1
             logger.debug(
                 "inserting new artifact for submod %s with shape %s "
                 "(%s bytes) at hash %s",
@@ -124,6 +126,7 @@ class StandaloneCompiledArtifacts:
 
         def _load_entry(entry_bytes: bytes) -> AOTCompiledArtifact:
             entry = pickle.loads(entry_bytes)
+            compilation_counter.num_compiled_artifacts_loaded += 1
             return AOTCompiledArtifact.deserialize(entry)
 
         with concurrent.futures.ThreadPoolExecutor() as executor: