[Model Runner V2] Refactor dummy_run (#32533)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
Woosuk Kwon
2026-01-19 14:50:59 -08:00
committed by GitHub
parent 4a5299c93f
commit 43fada5360
3 changed files with 34 additions and 55 deletions

View File

@@ -13,7 +13,6 @@ from vllm.config.compilation import CUDAGraphMode
from vllm.distributed.parallel_state import graph_capture, is_global_first_rank
from vllm.forward_context import set_forward_context
from vllm.v1.attention.backend import AttentionMetadataBuilder
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.worker.gpu.attn_utils import build_attn_metadata
from vllm.v1.worker.gpu.block_table import BlockTables
@@ -60,12 +59,12 @@ class CudaGraphManager:
def get_cudagraph_size(
self,
scheduler_output: SchedulerOutput,
num_tokens_after_padding: int,
num_tokens_per_request: Iterable[int],
) -> int | None:
return get_cudagraph_size(
num_tokens_after_padding,
scheduler_output.num_scheduled_tokens.values(),
num_tokens_per_request,
self.cudagraph_sizes,
self.cudagraph_mode,
)

View File

@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
import time
from collections.abc import Iterable
from copy import deepcopy
from typing import Any
@@ -288,47 +289,25 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
skip_attn: bool = True,
**kwargs,
) -> tuple[torch.Tensor, torch.Tensor]:
# Create a dummy scheduler output.
num_reqs = min(num_tokens, self.max_num_reqs)
input_batch = InputBatch.make_dummy(
num_reqs=num_reqs,
num_tokens=num_tokens,
input_buffers=self.input_buffers,
device=self.device,
)
if self.uses_mrope:
input_batch.mrope_positions = self.mrope_states.mrope_positions[
:, :num_tokens
]
if self.supports_mm_inputs:
input_batch.inputs_embeds = self.encoder_runner.inputs_embeds[:num_tokens]
if not skip_attn:
self.prepare_dummy_attn_metadata(input_batch)
num_tokens_per_request = [num_tokens // num_reqs] * num_reqs
num_tokens_per_request[-1] += num_tokens % num_reqs
assert sum(num_tokens_per_request) == num_tokens
num_scheduled_tokens = {
f"_dummy_req_{i}": num_tokens_per_request[i] for i in range(num_reqs)
}
dummy_scheduler_output = SchedulerOutput.make_empty()
dummy_scheduler_output.total_num_scheduled_tokens = num_tokens
dummy_scheduler_output.num_scheduled_tokens = num_scheduled_tokens
dp_size = self.parallel_config.data_parallel_size
num_tokens_across_dp = make_num_tokens_across_dp(dp_size, num_tokens)
num_sampled_tokens = np.ones(input_batch.num_reqs, dtype=np.int32)
positions = input_batch.positions
if self.uses_mrope:
positions = input_batch.mrope_positions
with (
self.maybe_dummy_run_with_lora(
self.lora_config,
input_batch.num_scheduled_tokens,
num_sampled_tokens,
),
set_forward_context(
input_batch.attn_metadata,
self.vllm_config,
num_tokens=num_tokens,
num_tokens_across_dp=num_tokens_across_dp,
),
):
hidden_states = self.model(
input_ids=input_batch.input_ids,
positions=positions,
inputs_embeds=input_batch.inputs_embeds,
)
sample_hidden_states = hidden_states[input_batch.logits_indices]
# Execute the model.
self.execute_model(
dummy_scheduler_output, dummy_run=True, skip_attn_for_dummy_run=skip_attn
)
assert self.execute_model_state is not None
hidden_states, input_batch = self.execute_model_state
sample_hidden_states = hidden_states[input_batch.logits_indices]
return hidden_states, sample_hidden_states
@torch.inference_mode()
@@ -893,9 +872,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
def get_cudagraph_and_dp_padding(
self,
scheduler_output: SchedulerOutput,
total_num_scheduled_tokens: int,
num_tokens_per_request: Iterable[int],
) -> tuple[CUDAGraphMode, int, torch.Tensor | None]:
total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
dp_size = self.parallel_config.data_parallel_size
if dp_size == 1:
# No DP. Only consider CUDA graphs.
@@ -904,7 +883,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
return CUDAGraphMode.NONE, 0, None
cudagraph_size = self.cudagraph_manager.get_cudagraph_size(
scheduler_output, total_num_scheduled_tokens
total_num_scheduled_tokens, num_tokens_per_request
)
if cudagraph_size is not None:
# Use full CUDA graph.
@@ -919,7 +898,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
cudagraph_size_before_dp: int | None = 0
else:
cudagraph_size_before_dp = self.cudagraph_manager.get_cudagraph_size(
scheduler_output, total_num_scheduled_tokens
total_num_scheduled_tokens, num_tokens_per_request
)
if cudagraph_size_before_dp is None:
cudagraph_size_before_dp = -1
@@ -951,6 +930,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
scheduler_output: SchedulerOutput,
intermediate_tensors: Any | None = None,
dummy_run: bool = False,
skip_attn_for_dummy_run: bool = False,
) -> ModelRunnerOutput | None:
assert intermediate_tensors is None
if not dummy_run:
@@ -965,7 +945,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
return EMPTY_MODEL_RUNNER_OUTPUT
cudagraph_mode, num_tokens_after_padding, num_tokens_across_dp = (
self.get_cudagraph_and_dp_padding(scheduler_output)
self.get_cudagraph_and_dp_padding(
scheduler_output.total_num_scheduled_tokens,
scheduler_output.num_scheduled_tokens.values(),
)
)
if num_tokens_after_padding == 0:
# All DP ranks have zero tokens to run.
@@ -999,7 +982,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
: input_batch.num_tokens_after_padding
]
else:
# No actual tokens to run. A dummy run for DP.
# No actual tokens to run. A dummy run for DP or memory profiling.
num_reqs = min(num_tokens_after_padding, self.max_num_reqs)
input_batch = InputBatch.make_dummy(
num_reqs=num_reqs,
@@ -1011,7 +994,9 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
input_batch.mrope_positions = self.mrope_states.mrope_positions[
:, :num_tokens_after_padding
]
self.prepare_dummy_attn_metadata(input_batch)
if not skip_attn_for_dummy_run:
self.prepare_dummy_attn_metadata(input_batch)
# FIXME(woosuk): Fix warmup for LoRA.
# Run model.
if cudagraph_mode == CUDAGraphMode.FULL:

View File

@@ -662,12 +662,7 @@ class Worker(WorkerBase):
self.profiler.stop()
def execute_dummy_batch(self) -> None:
if self.use_v2_model_runner:
self.model_runner.execute_model(
SchedulerOutput.make_empty(), dummy_run=True
)
else:
self.model_runner._dummy_run(1, uniform_decode=True)
self.model_runner._dummy_run(1, uniform_decode=True)
def add_lora(self, lora_request: LoRARequest) -> bool:
return self.model_runner.add_lora(lora_request)