GPU Model Runner V2 (#25266)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit was authored by Woosuk Kwon on 2025-11-21 08:20:55 -08:00 and committed via GitHub.
parent 1f400c58b8
commit 30b44a1598
18 changed files with 2639 additions and 12 deletions

View File

@@ -41,7 +41,7 @@ from vllm.sequence import IntermediateTensors
from vllm.tasks import SupportedTask
from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
from vllm.v1.core.sched.output import GrammarOutput
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
from vllm.v1.outputs import (
@@ -58,7 +58,6 @@ logger = init_logger(__name__)
if TYPE_CHECKING:
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
from vllm.v1.core.sched.output import SchedulerOutput
class Worker(WorkerBase):
@@ -101,6 +100,8 @@ class Worker(WorkerBase):
else:
self.profiler = None
self.use_v2_model_runner = envs.VLLM_USE_V2_MODEL_RUNNER
def sleep(self, level: int = 1) -> None:
from vllm.device_allocator.cumem import CuMemAllocator
@@ -237,9 +238,17 @@ class Worker(WorkerBase):
raise RuntimeError(f"Not support device type: {self.device_config.device}")
# Construct the model runner
self.model_runner: GPUModelRunner = GPUModelRunner(
self.vllm_config, self.device
)
if self.use_v2_model_runner:
from vllm.v1.worker.gpu.model_runner import (
GPUModelRunner as GPUModelRunnerV2,
)
# HACK(woosuk): This is a temporary fix to avoid type errors.
self.model_runner: GPUModelRunner = GPUModelRunnerV2( # type: ignore
self.vllm_config, self.device
)
else:
self.model_runner = GPUModelRunner(self.vllm_config, self.device)
if self.rank == 0:
# If usage stat is enabled, collect relevant info.
@@ -573,7 +582,12 @@ class Worker(WorkerBase):
self.profiler.stop()
def execute_dummy_batch(self) -> None:
    """Run a no-op batch through the model runner.

    Used when this rank must participate in model execution (e.g. to keep
    collective ops in sync) but has no real requests scheduled.

    Dispatches on ``self.use_v2_model_runner``: the V2 runner has no
    ``_dummy_run`` entry point, so an empty ``SchedulerOutput`` with
    ``dummy_run=True`` serves the same purpose there.
    """
    # NOTE: a stray unconditional `_dummy_run(1, uniform_decode=True)` call
    # preceding this conditional was removed — it duplicated the `else`
    # branch and would have run the dummy batch twice on the V1 path.
    if self.use_v2_model_runner:
        self.model_runner.execute_model(
            SchedulerOutput.make_empty(), dummy_run=True
        )
    else:
        self.model_runner._dummy_run(1, uniform_decode=True)
def add_lora(self, lora_request: LoRARequest) -> bool:
    """Forward *lora_request* to the model runner and report its success flag."""
    runner = self.model_runner
    return runner.add_lora(lora_request)