[torch.compile] Improve cold and warm start compile tests (#35709)
Signed-off-by: Richard Zou <zou3519@gmail.com>
This commit is contained in:
@@ -1,48 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from torch._dynamo.utils import counters
|
||||
|
||||
from vllm import LLM
|
||||
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
|
||||
|
||||
|
||||
def test_moe_compilation_cold_start(monkeypatch, use_fresh_inductor_cache):
    """Check that a cold vLLM-compile start only compiles unique subgraphs.

    vLLM-compile cold start is special. By default there is one full dynamo
    capture of the entire forward pass, which consists of 32 transformer
    layers. The capture is then split on the attention operation, which
    results in 33 subgraphs (not including the attention operation), and
    compiled artifacts are generated for the unique subgraphs only.

    There are actually only 3 unique subgraphs for this model (all of its
    transformer layers are the same modulo weights); this is true for most
    vLLM models. So during cold start we expect to be compiling for exactly
    3 unique subgraphs, with no cache hits.
    """
    env_overrides = {
        # Run in the same process so PyTorch's internal counters are visible.
        "VLLM_ENABLE_V1_MULTIPROCESSING": "0",
        # Unclear whether AOT compile would affect the numbers; turn it off.
        "VLLM_USE_AOT_COMPILE": "0",
        # Force cold compilation.
        "VLLM_DISABLE_COMPILE_CACHE": "1",
    }
    for env_name, env_value in env_overrides.items():
        monkeypatch.setenv(env_name, env_value)

    # cudagraph_mode, load_format and num_gpu_blocks_override are chosen
    # purely to make the model loading faster.
    llm_kwargs = dict(
        model="microsoft/Phi-tiny-MoE-instruct",
        max_model_len=256,
        load_format="dummy",
        compilation_config=CompilationConfig(
            mode=CompilationMode.VLLM_COMPILE,
            cudagraph_mode=CUDAGraphMode.NONE,
        ),
        num_gpu_blocks_override=8,
    )

    # Reset the counters right before the engine is built so that only this
    # start-up is measured.
    counters.clear()
    _engine = LLM(**llm_kwargs)

    aot_stats = counters["aot_autograd"]
    assert aot_stats["autograd_cache_miss"] == 3
    assert aot_stats["autograd_cache_hit"] == 0
|
||||
71
tests/compile/test_startup.py
Normal file
71
tests/compile/test_startup.py
Normal file
@@ -0,0 +1,71 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Cold start and warm start tests for vLLM-compile.
|
||||
|
||||
Cold start runs in a forked child (must fork before CUDA init) which
|
||||
populates on-disk caches and asserts cold-start counters. Warm start
|
||||
then runs in the parent with clean in-memory state but populated caches.
|
||||
"""
|
||||
|
||||
import multiprocessing as mp
|
||||
|
||||
from torch._dynamo.utils import counters
|
||||
|
||||
from vllm.compilation.counter import compilation_counter
|
||||
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
|
||||
|
||||
MODEL = "microsoft/Phi-tiny-MoE-instruct"
|
||||
|
||||
|
||||
def _run_vllm(vllm_runner):
    """Enter and immediately leave a ``vllm_runner`` context for ``MODEL``.

    The runner is created with vLLM-compile enabled, CUDA graphs turned
    off, dummy weights and a small KV-cache override. No request is
    issued; the ``with`` body is intentionally empty.

    Args:
        vllm_runner: Callable returning a context manager for a vLLM
            instance (presumably the pytest fixture of the same name).
    """
    compile_cfg = CompilationConfig(
        mode=CompilationMode.VLLM_COMPILE,
        cudagraph_mode=CUDAGraphMode.NONE,
    )
    runner_ctx = vllm_runner(
        MODEL,
        trust_remote_code=False,
        max_model_len=256,
        max_num_batched_tokens=1024,
        load_format="dummy",
        compilation_config=compile_cfg,
        num_gpu_blocks_override=8,
    )
    with runner_ctx:
        pass
|
||||
|
||||
|
||||
def _cold_start(vllm_runner):
    """Run one vLLM start-up and assert the cold-start compile counters.

    Expects 3 compiled artifacts to be saved and none loaded, and the
    ``aot_autograd`` counters to report 33 total calls, 3 cache misses
    and 0 cache hits.

    Args:
        vllm_runner: Runner factory forwarded to ``_run_vllm``.
    """
    counters.clear()

    expected_artifacts = {
        "num_compiled_artifacts_saved": 3,
        "num_compiled_artifacts_loaded": 0,
    }
    with compilation_counter.expect(**expected_artifacts):
        _run_vllm(vllm_runner)

    aot_stats = counters["aot_autograd"]
    assert aot_stats["total"] == 33
    assert aot_stats["autograd_cache_miss"] == 3
    assert aot_stats["autograd_cache_hit"] == 0
|
||||
|
||||
|
||||
def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache):
    """Exercise a cold start followed by a warm start of vLLM-compile.

    The cold start runs ``_cold_start`` in a forked child process; the
    warm start then runs in this process and checks that compiled
    artifacts are loaded rather than rebuilt.
    """
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")

    # Cold start in a forked child (must fork before CUDA init).
    # This model has 32 identical transformer layers which produce
    # 33 subgraphs after splitting on attention — only 3 are unique.
    fork_ctx = mp.get_context("fork")
    child = fork_ctx.Process(target=_cold_start, args=(vllm_runner,))
    child.start()
    child.join()
    assert child.exitcode == 0, "Cold-start child failed"

    # Warm start — compiled artifacts loaded from disk cache.
    counters.clear()
    expected_artifacts = {
        "num_compiled_artifacts_loaded": 3,
        # TODO: warm start should not save any artifacts
        # https://github.com/vllm-project/vllm/issues/35708
        "num_compiled_artifacts_saved": 1,
    }
    with compilation_counter.expect(**expected_artifacts):
        _run_vllm(vllm_runner)

    aot_stats = counters["aot_autograd"]
    assert aot_stats["total"] == 30
    assert aot_stats["autograd_cache_miss"] == 0
    assert aot_stats["autograd_cache_hit"] == 1
|
||||
Reference in New Issue
Block a user