[Deprecation] Remove inputs arg fallback in Engine classes (#18799)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Author: Cyrus Leung
Date: 2025-06-09 10:19:56 +08:00
Committed by: GitHub
parent 2ffb9b6e07
commit e31ae3de36
4 changed files with 21 additions and 268 deletions

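What this change means for callers: the deprecated `inputs=` keyword argument, and the
fallback that silently copied it into `prompt`, is removed from the Engine entry points,
so requests must name the prompt explicitly. Calls that still pass `inputs=` now fail
with a TypeError for an unexpected keyword argument rather than emitting a
DeprecationWarning. A minimal caller-side sketch of the migration (engine setup and
model name are illustrative only, not part of this commit):

    import asyncio

    from vllm import SamplingParams
    from vllm.engine.arg_utils import AsyncEngineArgs
    from vllm.engine.async_llm_engine import AsyncLLMEngine

    async def main() -> None:
        engine = AsyncLLMEngine.from_engine_args(
            AsyncEngineArgs(model="facebook/opt-125m"))

        # Before this commit (deprecated, emitted a warning):
        #     stream = await engine.add_request(
        #         request_id="req-0",
        #         inputs="Hello, my name is",  # DEPRECATED keyword
        #         params=SamplingParams(max_tokens=16))

        # After this commit, only 'prompt' is accepted:
        stream = await engine.add_request(
            request_id="req-0",
            prompt="Hello, my name is",
            params=SamplingParams(max_tokens=16))
        async for output in stream:
            print(output.outputs[0].text)

    asyncio.run(main())
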
vllm/engine/async_llm_engine.py

@@ -6,12 +6,10 @@ import copy
 import time
 import weakref
 from functools import partial
-from typing import (Any, AsyncGenerator, Callable, Coroutine, Dict, Iterable,
-                    List, Mapping, Optional, Set, Tuple, Type, Union, overload)
+from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List,
+                    Mapping, Optional, Set, Tuple, Type, Union)
 from weakref import ReferenceType
 
-from typing_extensions import deprecated
-
 import vllm.envs as envs
 from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, VllmConfig)
@@ -36,7 +34,7 @@ from vllm.sampling_params import SamplingParams
 from vllm.sequence import ExecuteModelRequest
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import Device, deprecate_kwargs, weak_bind
+from vllm.utils import Device, weak_bind
 
 logger = init_logger(__name__)
 
 ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S
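
With the fallback gone, the `deprecate_kwargs` and `deprecated` helpers are no longer
needed in this module, hence the import trims above. For reference, a
`deprecate_kwargs`-style shim behaves roughly like the following; this is an
illustrative sketch, not vLLM's actual `deprecate_kwargs` implementation:

    import warnings
    from functools import wraps

    def deprecate_inputs_kwarg(func):
        """Warn on the old 'inputs' keyword and forward it to 'prompt'."""

        @wraps(func)
        async def wrapper(self, request_id, prompt=None, *args,
                          inputs=None, **kwargs):
            if inputs is not None:
                warnings.warn(
                    "The 'inputs' parameter is deprecated. "
                    "Please use the 'prompt' parameter instead.",
                    DeprecationWarning,
                    stacklevel=2)
                prompt = inputs  # the fallback removed by this commit
            return await func(self, request_id, prompt, *args, **kwargs)

        return wrapper
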
@@ -429,24 +427,6 @@ class _AsyncLLMEngine(LLMEngine):
         return await (
             self.get_tokenizer_group().get_lora_tokenizer_async(lora_request))
 
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    async def add_request_async(
-        self,
-        request_id: str,
-        *,
-        inputs: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
-    ) -> None:
-        ...
-
-    @overload
     async def add_request_async(
         self,
         request_id: str,
@@ -459,32 +439,10 @@ class _AsyncLLMEngine(LLMEngine):
         priority: int = 0,
         data_parallel_rank: Optional[int] = None,
     ) -> None:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    async def add_request_async(
-        self,
-        request_id: str,
-        prompt: Optional[PromptType] = None,
-        params: Optional[Union[SamplingParams, PoolingParams]] = None,
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
-        *,
-        inputs: Optional[PromptType] = None,  # DEPRECATED
-    ) -> None:
-        """Async version of
-        [`add_request`][vllm.engine.llm_engine.LLMEngine.add_request]."""
-        if inputs is not None:
-            prompt = inputs
-        assert prompt is not None and params is not None
-
+        """
+        Async version of
+        [`add_request`][vllm.engine.llm_engine.LLMEngine.add_request].
+        """
         if lora_request is not None and not self.lora_config:
             raise ValueError(f"Got lora_request {lora_request} but LoRA is "
                              "not enabled!")
@@ -521,8 +479,7 @@ class _AsyncLLMEngine(LLMEngine):
             params = await build_guided_decoding_logits_processor_async(
                 sampling_params=params,
                 tokenizer=await self.get_tokenizer_async(lora_request),
-                default_guided_backend=self.decoding_config.
-                guided_decoding_backend,
+                default_guided_backend=self.decoding_config.backend,
                 reasoning_backend=self.decoding_config.reasoning_backend,
                 model_config=self.model_config)
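
A drive-by cleanup rides along in this hunk: the guided-decoding default now reads
the renamed `DecodingConfig.backend` field in one expression, replacing the old
`guided_decoding_backend` attribute access that was wrapped across two lines. A tiny
sketch of the renamed access, assuming `DecodingConfig` is constructible with its
defaults (field names as they appear in this diff):

    from vllm.config import DecodingConfig

    config = DecodingConfig()
    # Removed spelling: config.guided_decoding_backend
    # Current spelling: the shorter `backend` field
    print(config.backend)
    print(config.reasoning_backend)
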
@@ -894,28 +851,7 @@ class AsyncLLMEngine(EngineClient):
                 raise
             await asyncio.sleep(0)
 
-    # This method does not need to be async, but kept that way
-    # for backwards compatibility.
-    @overload
-    @deprecated("'inputs' will be renamed to 'prompt")
-    def add_request(
-        self,
-        request_id: str,
-        *,
-        inputs: PromptType,
-        params: Union[SamplingParams, PoolingParams],
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
-    ) -> Coroutine[None, None, AsyncGenerator[Union[
-            RequestOutput, PoolingRequestOutput], None]]:
-        ...
-
-    @overload
-    def add_request(
+    async def add_request(
         self,
         request_id: str,
         prompt: PromptType,
@@ -926,32 +862,7 @@ class AsyncLLMEngine(EngineClient):
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         priority: int = 0,
         data_parallel_rank: Optional[int] = None,
-    ) -> Coroutine[None, None, AsyncGenerator[Union[
-            RequestOutput, PoolingRequestOutput], None]]:
-        ...
-
-    @deprecate_kwargs(
-        "inputs",
-        additional_message="Please use the 'prompt' parameter instead.",
-    )
-    async def add_request(
-        self,
-        request_id: str,
-        prompt: Optional[PromptType] = None,
-        params: Optional[Union[SamplingParams, PoolingParams]] = None,
-        arrival_time: Optional[float] = None,
-        lora_request: Optional[LoRARequest] = None,
-        trace_headers: Optional[Mapping[str, str]] = None,
-        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
-        priority: int = 0,
-        data_parallel_rank: Optional[int] = None,
-        *,
-        inputs: Optional[PromptType] = None,  # DEPRECATED
-    ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]:
-        if inputs is not None:
-            prompt = inputs
-        assert prompt is not None and params is not None
-
+    ) -> AsyncGenerator[Union[RequestOutput, PoolingRequestOutput], None]:
         if not self.is_running:
             if self.start_engine_loop:
                 self.start_background_loop()
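
The comment removed above ("This method does not need to be async, but kept that way
for backwards compatibility") points at why collapsing the overloads is safe: a plain
function returning a Coroutine and an `async def` are used identically at the call
site (`await`, then iterate the stream). A self-contained toy illustration of that
equivalence (not vLLM code):

    import asyncio
    from typing import AsyncGenerator, Coroutine

    async def _stream() -> AsyncGenerator[int, None]:
        # Stand-in for a per-request output stream.
        for i in range(3):
            yield i

    # Old spelling: a plain function annotated as returning a coroutine.
    def add_request_v1() -> Coroutine[None, None, AsyncGenerator[int, None]]:
        async def inner() -> AsyncGenerator[int, None]:
            return _stream()
        return inner()

    # New spelling: an honest `async def`, as in this commit.
    async def add_request_v2() -> AsyncGenerator[int, None]:
        return _stream()

    async def main() -> None:
        for factory in (add_request_v1, add_request_v2):
            gen = await factory()          # identical call site either way
            print([x async for x in gen])  # prints [0, 1, 2] for both

    asyncio.run(main())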