GPU Model Runner V2 (#25266)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
@@ -41,7 +41,7 @@ from vllm.sequence import IntermediateTensors
|
||||
from vllm.tasks import SupportedTask
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
|
||||
from vllm.v1.core.sched.output import GrammarOutput
|
||||
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
|
||||
from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
|
||||
from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
|
||||
from vllm.v1.outputs import (
|
||||
@@ -58,7 +58,6 @@ logger = init_logger(__name__)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
|
||||
|
||||
class Worker(WorkerBase):
|
||||
@@ -101,6 +100,8 @@ class Worker(WorkerBase):
|
||||
else:
|
||||
self.profiler = None
|
||||
|
||||
self.use_v2_model_runner = envs.VLLM_USE_V2_MODEL_RUNNER
|
||||
|
||||
def sleep(self, level: int = 1) -> None:
|
||||
from vllm.device_allocator.cumem import CuMemAllocator
|
||||
|
||||
@@ -237,9 +238,17 @@ class Worker(WorkerBase):
|
||||
raise RuntimeError(f"Not support device type: {self.device_config.device}")
|
||||
|
||||
# Construct the model runner
|
||||
self.model_runner: GPUModelRunner = GPUModelRunner(
|
||||
self.vllm_config, self.device
|
||||
)
|
||||
if self.use_v2_model_runner:
|
||||
from vllm.v1.worker.gpu.model_runner import (
|
||||
GPUModelRunner as GPUModelRunnerV2,
|
||||
)
|
||||
|
||||
# HACK(woosuk): This is a temporary fix to avoid type errors.
|
||||
self.model_runner: GPUModelRunner = GPUModelRunnerV2( # type: ignore
|
||||
self.vllm_config, self.device
|
||||
)
|
||||
else:
|
||||
self.model_runner = GPUModelRunner(self.vllm_config, self.device)
|
||||
|
||||
if self.rank == 0:
|
||||
# If usage stat is enabled, collect relevant info.
|
||||
@@ -573,7 +582,12 @@ class Worker(WorkerBase):
|
||||
self.profiler.stop()
|
||||
|
||||
def execute_dummy_batch(self) -> None:
|
||||
self.model_runner._dummy_run(1, uniform_decode=True)
|
||||
if self.use_v2_model_runner:
|
||||
self.model_runner.execute_model(
|
||||
SchedulerOutput.make_empty(), dummy_run=True
|
||||
)
|
||||
else:
|
||||
self.model_runner._dummy_run(1, uniform_decode=True)
|
||||
|
||||
def add_lora(self, lora_request: LoRARequest) -> bool:
|
||||
return self.model_runner.add_lora(lora_request)
|
||||
|
||||
Reference in New Issue
Block a user