diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md
index 22aae54ed..1f491a3a4 100644
--- a/docs/design/plugin_system.md
+++ b/docs/design/plugin_system.md
@@ -155,3 +155,4 @@ The interface for the model/module may change during vLLM's development. If you
 - `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0.
 - `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.v1.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead.
 - `seed_everything` platform interface is deprecated. It has been removed in v0.16.0. Please use `vllm.utils.torch_utils.set_random_seed` instead.
+- `prompt` parameter in `Platform.validate_request` is deprecated and will be removed in v0.18.0.
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index cfaf03e2d..57fd4b67c 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -519,7 +519,6 @@ class LLM:
             ),
             params=seq_params,
             lora_requests=seq_lora_requests,
-            tokenization_kwargs=tokenization_kwargs,
             priorities=seq_priority,
         )
 
@@ -1813,7 +1812,6 @@ class LLM:
             params=seq_params,
             use_tqdm=use_tqdm,
             lora_requests=seq_lora_requests,
-            tokenization_kwargs=tokenization_kwargs,
             priorities=seq_priority,
         )
 
@@ -1872,7 +1870,6 @@ class LLM:
             params=seq_params,
             lora_requests=seq_lora_requests,
             use_tqdm=use_tqdm,
-            tokenization_kwargs=tokenization_kwargs,
         )
 
     def _render_and_run_requests(
@@ -1881,7 +1878,6 @@ class LLM:
         params: Sequence[SamplingParams | PoolingParams],
         *,
         lora_requests: Sequence[LoRARequest | None] | None = None,
-        tokenization_kwargs: dict[str, Any] | None = None,
         priorities: Sequence[int] | None = None,
         use_tqdm: bool | Callable[..., tqdm] = True,
     ):
@@ -1899,7 +1895,6 @@ class LLM:
             prompts=prompts,
             params=params,
             lora_requests=lora_requests,
-            tokenization_kwargs=tokenization_kwargs,
             priorities=priorities,
         )
 
@@ -1911,7 +1906,6 @@ class LLM:
         params: Sequence[SamplingParams | PoolingParams],
         *,
         lora_requests: Sequence[LoRARequest | None] | None = None,
-        tokenization_kwargs: dict[str, Any] | None = None,
         priorities: Sequence[int] | None = None,
     ) -> list[str]:
         added_request_ids: list[str] = []
@@ -1922,7 +1916,6 @@ class LLM:
                 prompt,
                 params[i],
                 lora_request=None if lora_requests is None else lora_requests[i],
-                tokenization_kwargs=tokenization_kwargs,
                 priority=0 if priorities is None else priorities[i],
             )
             added_request_ids.append(request_id)
@@ -1938,7 +1931,6 @@ class LLM:
         prompt: ProcessorInputs,
         params: SamplingParams | PoolingParams,
         lora_request: LoRARequest | None = None,
-        tokenization_kwargs: dict[str, Any] | None = None,
         priority: int = 0,
     ) -> str:
         if isinstance(params, SamplingParams):
@@ -1947,27 +1939,11 @@ class LLM:
 
         request_id = str(next(self.request_counter))
 
-        if params.truncate_prompt_tokens is not None:
-            params_type = type(params).__name__
-            warnings.warn(
-                f"The `truncate_prompt_tokens` parameter in `{params_type}` "
-                "is deprecated and will be removed in v0.16. "
-                "Please pass it via `tokenization_kwargs` instead.",
-                DeprecationWarning,
-                stacklevel=2,
-            )
-
-            tokenization_kwargs = merge_kwargs(
-                tokenization_kwargs,
-                dict(truncate_prompt_tokens=params.truncate_prompt_tokens),
-            )
-
         return self.llm_engine.add_request(
             request_id,
             prompt,
             params,
             lora_request=lora_request,
-            tokenization_kwargs=tokenization_kwargs,
             priority=priority,
         )
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index cef78e525..6794c05f5 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -17,7 +17,7 @@ if TYPE_CHECKING:
     from torch.distributed import PrefixStore, ProcessGroup
 
     from vllm.config import VllmConfig
-    from vllm.inputs import ProcessorInputs, PromptType
+    from vllm.inputs import ProcessorInputs
     from vllm.pooling_params import PoolingParams
     from vllm.sampling_params import SamplingParams
     from vllm.utils.argparse_utils import FlexibleArgumentParser
@@ -568,9 +568,8 @@ class Platform:
     @classmethod
     def validate_request(
         cls,
-        prompt: "PromptType | ProcessorInputs",
-        params: "SamplingParams | PoolingParams",
         processed_inputs: "ProcessorInputs",
+        params: "SamplingParams | PoolingParams",
     ) -> None:
         """Raises if this request is unsupported on this platform"""
 
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index bb9715bbd..df8e994da 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -27,7 +27,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 from vllm.outputs import STREAM_FINISHED, PoolingRequestOutput, RequestOutput
 from vllm.plugins.io_processors import get_io_processor
 from vllm.pooling_params import PoolingParams
-from vllm.renderers import merge_kwargs, renderer_from_config
+from vllm.renderers import renderer_from_config
 from vllm.renderers.inputs.preprocess import extract_prompt_components
 from vllm.sampling_params import RequestOutputKind, SamplingParams
 from vllm.tasks import SupportedTask
@@ -319,21 +319,6 @@ class AsyncLLM(EngineClient):
                 "prompt logprobs"
             )
 
-        if params.truncate_prompt_tokens is not None:
-            params_type = type(params).__name__
-            warnings.warn(
-                f"The `truncate_prompt_tokens` parameter in `{params_type}` "
-                "is deprecated and will be removed in v0.16. "
-                "Please pass it via `tokenization_kwargs` instead.",
-                DeprecationWarning,
-                stacklevel=2,
-            )
-
-            tokenization_kwargs = merge_kwargs(
-                tokenization_kwargs,
-                dict(truncate_prompt_tokens=params.truncate_prompt_tokens),
-            )
-
         if isinstance(prompt, AsyncGenerator):
             if reasoning_ended is not None:
                 raise NotImplementedError
@@ -353,6 +338,12 @@ class AsyncLLM(EngineClient):
 
         # Convert Input --> Request.
         if isinstance(prompt, EngineCoreRequest):
+            logger.warning_once(
+                "Passing EngineCoreRequest to AsyncLLM.generate() and .add_request() "
+                "is deprecated and will be removed in v0.18. You should instead pass "
+                "the outputs of Renderer.render_cmpl() or Renderer.render_chat()."
+            )
+
             request = prompt
             if request_id != request.request_id:
                 logger.warning_once(
diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py
index 4aaa26533..be221e486 100644
--- a/vllm/v1/engine/input_processor.py
+++ b/vllm/v1/engine/input_processor.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import time
+import warnings
 from collections.abc import Mapping
 from typing import Any, Literal
 
@@ -28,6 +29,7 @@ from vllm.sampling_params import SamplingParams
 from vllm.tasks import POOLING_TASKS, SupportedTask
 from vllm.tokenizers import TokenizerLike
 from vllm.utils import length_from_prompt_token_ids_or_embeds, random_uuid
+from vllm.utils.func_utils import supports_kw
 from vllm.utils.jsontree import json_iter_leaves
 from vllm.v1.engine import EngineCoreRequest
 
@@ -72,6 +74,33 @@ class InputProcessor:
             mm_registry=mm_registry,
         )
 
+        from vllm.platforms import current_platform
+
+        platform_validate_request = current_platform.validate_request
+        if supports_kw(platform_validate_request, "prompt"):
+            logger.warning_once(
+                "The signature of Platform.validate_request has changed from "
+                "`(cls, prompt, params, processed_inputs) -> None` to "
+                "`(cls, processed_inputs, params) -> None`. The old signature "
+                "will no longer be supported starting from v0.18."
+            )
+
+            orig_validate_request = platform_validate_request
+
+            def compat_validate_request(
+                processed_inputs: ProcessorInputs,
+                params: SamplingParams | PoolingParams,
+            ):
+                return orig_validate_request(
+                    processed_inputs,
+                    params,
+                    processed_inputs,  # type: ignore
+                )  # type: ignore
+
+            platform_validate_request = compat_validate_request
+
+        self._platform_validate_request = platform_validate_request
+
     @property
     def tokenizer(self) -> TokenizerLike | None:
         return self.renderer.tokenizer
@@ -87,6 +116,16 @@ class InputProcessor:
         supported_tasks: tuple[SupportedTask, ...] | None,
     ):
         """Raise `ValueError` if SamplingParams or PoolingParams is not valid."""
+        if params.truncate_prompt_tokens is not None:
+            params_type = type(params).__name__
+            warnings.warn(
+                f"The `truncate_prompt_tokens` parameter in `{params_type}` "
+                "is deprecated and will be removed in v0.17. "
+                "Please pass it via `tokenization_kwargs` instead.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+
         if isinstance(params, SamplingParams):
             params.verify(
                 self.model_config,
@@ -211,11 +250,24 @@ class InputProcessor:
         )
 
         if isinstance(prompt, dict) and "type" in prompt:
+            if tokenization_kwargs:
+                logger.warning_once(
+                    "Passing tokenization_kwargs to InputProcessor is deprecated "
+                    "and will be removed in v0.18. You should instead pass "
+                    "them to Renderer.render_cmpl() or Renderer.render_chat()."
+                )
+
             if arrival_time is None:
                 arrival_time = prompt.get("arrival_time", time.time())  # type: ignore[assignment]
 
             processed_inputs: ProcessorInputs = prompt  # type: ignore[assignment]
         else:
+            logger.warning_once(
+                "Passing raw prompts to InputProcessor is deprecated "
+                "and will be removed in v0.18. You should instead pass "
+                "the outputs of Renderer.render_cmpl() or Renderer.render_chat()."
+            )
+
             if arrival_time is None:
                 arrival_time = time.time()
 
@@ -224,13 +276,7 @@ class InputProcessor:
                 tokenization_kwargs=tokenization_kwargs,
             )
 
-        from vllm.platforms import current_platform
-
-        current_platform.validate_request(
-            prompt=prompt,
-            params=params,
-            processed_inputs=processed_inputs,
-        )
+        self._platform_validate_request(processed_inputs, params)
 
         encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs)
         self._validate_model_inputs(encoder_inputs, decoder_inputs)
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index c4cf6baee..c4f0442f3 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -234,10 +234,16 @@ class LLMEngine:
 
         # Process raw inputs into the request.
         if isinstance(prompt, EngineCoreRequest):
+            logger.warning_once(
+                "Passing EngineCoreRequest to LLMEngine.generate() and .add_request() "
+                "is deprecated and will be removed in v0.18. You should instead pass "
+                "the outputs of Renderer.render_cmpl() or Renderer.render_chat()."
+            )
+
             request = prompt
             if request_id != request.request_id:
                 logger.warning_once(
-                    "AsyncLLM.add_request() was passed a request_id parameter that "
+                    "LLMEngine.add_request() was passed a request_id parameter that "
                     "does not match the EngineCoreRequest.request_id attribute. The "
                     "latter will be used, and the former will be ignored."
                 )
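
For out-of-tree platforms affected by the `Platform.validate_request` change above, migrating an override is mostly a matter of dropping the deprecated `prompt` argument and swapping the order of the remaining two. A minimal sketch follows; `MyPlatform` and its n == 1 restriction are hypothetical, and only the imported vLLM types come from the diff above.

```python
# Sketch of an out-of-tree platform adopting the new validate_request
# signature. `MyPlatform` and the n == 1 restriction are illustrative
# assumptions, not part of this change.
from vllm.inputs import ProcessorInputs
from vllm.platforms.interface import Platform
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams


class MyPlatform(Platform):
    @classmethod
    def validate_request(
        cls,
        processed_inputs: "ProcessorInputs",
        params: "SamplingParams | PoolingParams",
    ) -> None:
        # The deprecated `prompt` argument is gone: validate against the
        # already-processed inputs and the sampling/pooling params instead.
        if isinstance(params, SamplingParams) and params.n > 1:
            raise ValueError("MyPlatform only supports n == 1 sampling")
```

Old-style overrides that still accept `prompt` keep working until v0.18 through the `compat_validate_request` shim added in `InputProcessor.__init__`, which passes the processed inputs in place of the raw prompt.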