[compile] Add compiled artifact counter for VLLM_USE_MEGA_AOT_ARTIFACT=1. (#37589)

Signed-off-by: zhxchen17 <zhxchen17@fb.com>
This commit is contained in:
Zhengxu Chen
2026-03-20 12:22:46 -04:00
committed by GitHub
parent 880be2b1b8
commit 2e089b96a8
2 changed files with 17 additions and 2 deletions

View File

@@ -9,11 +9,15 @@ then runs in the parent with clean in-memory state but populated caches.
import multiprocessing as mp
import pytest
from torch._dynamo.utils import counters
import vllm.envs as envs
from vllm.compilation.counter import compilation_counter
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
from ..utils import fork_new_process_for_each_test
MODEL = "microsoft/Phi-tiny-MoE-instruct"
@@ -45,8 +49,11 @@ def _cold_start(vllm_runner):
assert counters["aot_autograd"]["autograd_cache_hit"] == 0
def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache):
@fork_new_process_for_each_test
@pytest.mark.parametrize("mega_aot_artifact", ["0", "1"])
def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache, mega_aot_artifact):
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
monkeypatch.setenv("VLLM_USE_MEGA_AOT_ARTIFACT", mega_aot_artifact)
# Cold start in a forked child (must fork before CUDA init).
# This model has 32 identical transformer layers which produce
@@ -64,7 +71,12 @@ def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache):
num_compiled_artifacts_saved=0,
):
_run_vllm(vllm_runner)
assert counters["aot_autograd"]["total"] == 30
if envs.VLLM_USE_MEGA_AOT_ARTIFACT:
# VLLM_USE_MEGA_AOT_ARTIFACT is enabled, so aot_autograd is not expected to
# run on any subgraph.
assert counters["aot_autograd"]["total"] == 0
else:
assert counters["aot_autograd"]["total"] == 30
assert counters["aot_autograd"]["autograd_cache_miss"] == 0
assert (
counters["aot_autograd"]["autograd_cache_hit"] == 0

View File

@@ -17,6 +17,7 @@ from torch.utils import _pytree as pytree
import vllm.envs as envs
from vllm.compilation.compiler_interface import get_inductor_factors
from vllm.compilation.counter import compilation_counter
from vllm.config import VllmConfig, get_current_vllm_config
from vllm.config.utils import hash_factors
from vllm.logger import init_logger
@@ -61,6 +62,7 @@ class StandaloneCompiledArtifacts:
self.submodule_bytes[f"{submod_name}_{shape}"] = hex_digest
if hex_digest not in self.submodule_bytes_store:
self.submodule_bytes_store[hex_digest] = entry
compilation_counter.num_compiled_artifacts_saved += 1
logger.debug(
"inserting new artifact for submod %s with shape %s "
"(%s bytes) at hash %s",
@@ -124,6 +126,7 @@ class StandaloneCompiledArtifacts:
def _load_entry(entry_bytes: bytes) -> AOTCompiledArtifact:
    """Rebuild one compiled artifact from its pickled byte representation.

    Unpickles ``entry_bytes``, bumps the
    ``compilation_counter.num_compiled_artifacts_loaded`` counter, and hands
    the unpickled payload to ``AOTCompiledArtifact.deserialize``.

    Args:
        entry_bytes: Pickled payload for a single artifact.

    Returns:
        The result of ``AOTCompiledArtifact.deserialize`` on the payload.
    """
    # NOTE(review): pickle.loads executes arbitrary code for hostile input;
    # presumably these bytes only come from a trusted artifact store -- confirm.
    payload = pickle.loads(entry_bytes)
    # Count only after unpickling succeeded, before deserialization is attempted.
    compilation_counter.num_compiled_artifacts_loaded += 1
    artifact = AOTCompiledArtifact.deserialize(payload)
    return artifact
with concurrent.futures.ThreadPoolExecutor() as executor: