diff --git a/tests/compile/test_startup.py b/tests/compile/test_startup.py index 545299565..32a586011 100644 --- a/tests/compile/test_startup.py +++ b/tests/compile/test_startup.py @@ -9,11 +9,15 @@ then runs in the parent with clean in-memory state but populated caches. import multiprocessing as mp +import pytest from torch._dynamo.utils import counters +import vllm.envs as envs from vllm.compilation.counter import compilation_counter from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode +from ..utils import fork_new_process_for_each_test + MODEL = "microsoft/Phi-tiny-MoE-instruct" @@ -45,8 +49,11 @@ def _cold_start(vllm_runner): assert counters["aot_autograd"]["autograd_cache_hit"] == 0 -def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache): +@fork_new_process_for_each_test +@pytest.mark.parametrize("mega_aot_artifact", ["0", "1"]) +def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache, mega_aot_artifact): monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") + monkeypatch.setenv("VLLM_USE_MEGA_AOT_ARTIFACT", mega_aot_artifact) # Cold start in a forked child (must fork before CUDA init). # This model has 32 identical transformer layers which produce @@ -64,7 +71,12 @@ def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache): num_compiled_artifacts_saved=0, ): _run_vllm(vllm_runner) - assert counters["aot_autograd"]["total"] == 30 + if envs.VLLM_USE_MEGA_AOT_ARTIFACT: + # MEGA_AOT_ARTIFACT is enabled, so we expect no aot_autograd running on + # subgraphs. + assert counters["aot_autograd"]["total"] == 0 + else: + assert counters["aot_autograd"]["total"] == 30 assert counters["aot_autograd"]["autograd_cache_miss"] == 0 assert ( counters["aot_autograd"]["autograd_cache_hit"] == 0 diff --git a/vllm/compilation/caching.py b/vllm/compilation/caching.py index 1f5a87304..c089f02a3 100644 --- a/vllm/compilation/caching.py +++ b/vllm/compilation/caching.py @@ -17,6 +17,7 @@ from torch.utils import _pytree as pytree import vllm.envs as envs from vllm.compilation.compiler_interface import get_inductor_factors +from vllm.compilation.counter import compilation_counter from vllm.config import VllmConfig, get_current_vllm_config from vllm.config.utils import hash_factors from vllm.logger import init_logger @@ -61,6 +62,7 @@ class StandaloneCompiledArtifacts: self.submodule_bytes[f"{submod_name}_{shape}"] = hex_digest if hex_digest not in self.submodule_bytes_store: self.submodule_bytes_store[hex_digest] = entry + compilation_counter.num_compiled_artifacts_saved += 1 logger.debug( "inserting new artifact for submod %s with shape %s " "(%s bytes) at hash %s", @@ -124,6 +126,7 @@ class StandaloneCompiledArtifacts: def _load_entry(entry_bytes: bytes) -> AOTCompiledArtifact: entry = pickle.loads(entry_bytes) + compilation_counter.num_compiled_artifacts_loaded += 1 return AOTCompiledArtifact.deserialize(entry) with concurrent.futures.ThreadPoolExecutor() as executor: