[compile] Add compiled artifact counter for VLLM_USE_MEGA_AOT_ARTIFACT=1. (#37589)

Signed-off-by: zhxchen17 <zhxchen17@fb.com>
This commit is contained in:
Zhengxu Chen
2026-03-20 12:22:46 -04:00
committed by GitHub
parent 880be2b1b8
commit 2e089b96a8
2 changed files with 17 additions and 2 deletions

View File

@@ -9,11 +9,15 @@ then runs in the parent with clean in-memory state but populated caches.
import multiprocessing as mp
import pytest
from torch._dynamo.utils import counters
import vllm.envs as envs
from vllm.compilation.counter import compilation_counter
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
from ..utils import fork_new_process_for_each_test
MODEL = "microsoft/Phi-tiny-MoE-instruct"
@@ -45,8 +49,11 @@ def _cold_start(vllm_runner):
assert counters["aot_autograd"]["autograd_cache_hit"] == 0
def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache):
@fork_new_process_for_each_test
@pytest.mark.parametrize("mega_aot_artifact", ["0", "1"])
def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache, mega_aot_artifact):
monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
monkeypatch.setenv("VLLM_USE_MEGA_AOT_ARTIFACT", mega_aot_artifact)
# Cold start in a forked child (must fork before CUDA init).
# This model has 32 identical transformer layers which produce
@@ -64,7 +71,12 @@ def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache):
num_compiled_artifacts_saved=0,
):
_run_vllm(vllm_runner)
assert counters["aot_autograd"]["total"] == 30
if envs.VLLM_USE_MEGA_AOT_ARTIFACT:
# MEGA_AOT_ARTIFACT is enabled, so aot_autograd should not run on any
# subgraphs.
assert counters["aot_autograd"]["total"] == 0
else:
assert counters["aot_autograd"]["total"] == 30
assert counters["aot_autograd"]["autograd_cache_miss"] == 0
assert (
counters["aot_autograd"]["autograd_cache_hit"] == 0