[Core] Async scheduling + structured outputs compatibility (#26866)
Signed-off-by: Nick Hill <nhill@redhat.com>
@@ -6,6 +6,7 @@ import copy
 import gc
 import os
 from contextlib import AbstractContextManager, nullcontext
+from types import NoneType
 from typing import TYPE_CHECKING, Any

 import torch
@@ -37,6 +38,7 @@ from vllm.sequence import IntermediateTensors
 from vllm.tasks import SupportedTask
 from vllm.utils.mem_constants import GiB_bytes
 from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
+from vllm.v1.core.sched.output import GrammarOutput
 from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import (
@@ -508,11 +510,16 @@ class Worker(WorkerBase):
     def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
         return self.model_runner.get_supported_tasks()

     @torch.inference_mode()
+    def sample_tokens(
+        self, grammar_output: "GrammarOutput"
+    ) -> ModelRunnerOutput | AsyncModelRunnerOutput:
+        return self.model_runner.sample_tokens(grammar_output)
+
+    @torch.inference_mode()
     def execute_model(
-        self,
-        scheduler_output: "SchedulerOutput",
-    ) -> ModelRunnerOutput | AsyncModelRunnerOutput | None:
+        self, scheduler_output: "SchedulerOutput"
+    ) -> ModelRunnerOutput | None:
         intermediate_tensors = None
         forward_pass = scheduler_output.total_num_scheduled_tokens > 0
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
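Note: this hunk splits sampling out of the worker's forward-pass path. execute_model keeps its entry point but now returns ModelRunnerOutput | None, while the new sample_tokens method takes the structured-output GrammarOutput and is the call that can hand back an AsyncModelRunnerOutput, which is what lets async scheduling coexist with structured outputs. Below is a minimal sketch of how a caller might drive the split interface; the run_one_step helper and the way scheduler_output and grammar_output are obtained are illustrative assumptions, not part of this diff.

# Illustrative sketch only (not from this diff): driving the split interface.
# `worker` is a Worker as patched above; how the engine builds
# `scheduler_output` and `grammar_output` is assumed rather than shown.

def run_one_step(worker, scheduler_output, grammar_output):
    # Forward pass first; under the new signature this yields a finished
    # ModelRunnerOutput, or None (e.g. nothing to return at this point).
    output = worker.execute_model(scheduler_output)
    if output is not None:
        return output

    # Sampling runs as a separate step so the structured-output grammar
    # (e.g. a token bitmask) prepared while the model was running can be
    # applied. This call may return an AsyncModelRunnerOutput whose final
    # tokens the engine collects later.
    return worker.sample_tokens(grammar_output)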
@@ -531,13 +538,13 @@ class Worker(WorkerBase):
             )

         output = self.model_runner.execute_model(scheduler_output, intermediate_tensors)
-        if isinstance(output, (ModelRunnerOutput, AsyncModelRunnerOutput)):
+        if isinstance(output, (ModelRunnerOutput, NoneType)):
             return output

         assert isinstance(output, IntermediateTensors)
         parallel_config = self.vllm_config.parallel_config
         assert (
-            parallel_config.distributed_executor_backend != ("external_launcher")
+            parallel_config.distributed_executor_backend != "external_launcher"
             and not get_pp_group().is_last_rank
         )

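Note: since execute_model no longer returns AsyncModelRunnerOutput, the early-return check switches to (ModelRunnerOutput, NoneType); anything else must be IntermediateTensors produced for pipeline parallelism on a non-last rank (and never under the external_launcher backend). A stand-alone restatement of that dispatch follows; it is a sketch under those assumptions, with ModelRunnerOutput assumed to come from the truncated vllm.v1.outputs import, and it omits the actual hand-off of tensors to the next rank.

# Illustrative sketch only: the dispatch at the tail of execute_model,
# restated as a free function. Imports mirror those visible in the diff;
# anything beyond the isinstance/assert structure is assumed.
from types import NoneType

from vllm.sequence import IntermediateTensors
from vllm.v1.outputs import ModelRunnerOutput


def dispatch_runner_output(output, is_last_pp_rank: bool):
    # Completed per-step output (or nothing to report): return it directly.
    # AsyncModelRunnerOutput no longer appears here because sampling moved
    # into sample_tokens; None takes its place in the check.
    if isinstance(output, (ModelRunnerOutput, NoneType)):
        return output

    # Otherwise this rank produced intermediate hidden states for the next
    # pipeline-parallel stage; only a non-last rank should reach this branch.
    assert isinstance(output, IntermediateTensors)
    assert not is_last_pp_rank
    return None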