diff --git a/tests/entrypoints/pooling/embed/test_online.py b/tests/entrypoints/pooling/embed/test_online.py
index 89341670c..adec62334 100644
--- a/tests/entrypoints/pooling/embed/test_online.py
+++ b/tests/entrypoints/pooling/embed/test_online.py
@@ -683,13 +683,13 @@ async def test_params_not_supported(
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-async def test_normalize(server: RemoteOpenAIServer, model_name: str):
-    async def get_outputs(normalize):
+async def test_use_activation(server: RemoteOpenAIServer, model_name: str):
+    async def get_outputs(use_activation):
         request_args = {
             "model": MODEL_NAME,
             "input": input_text,
             "encoding_format": "float",
-            "normalize": normalize,
+            "use_activation": use_activation,
         }
 
         response = requests.post(server.url_for("v1/embeddings"), json=request_args)
@@ -697,9 +697,9 @@ async def test_normalize(server: RemoteOpenAIServer, model_name: str):
 
         return torch.tensor([x["embedding"] for x in outputs["data"]])
 
-    default = await get_outputs(normalize=None)
-    w_normal = await get_outputs(normalize=True)
-    wo_normal = await get_outputs(normalize=False)
+    default = await get_outputs(use_activation=None)
+    w_normal = await get_outputs(use_activation=True)
+    wo_normal = await get_outputs(use_activation=False)
 
     assert torch.allclose(default, w_normal, atol=1e-2), "Default should use normal."
     assert not torch.allclose(w_normal, wo_normal, atol=1e-2), (
diff --git a/vllm/entrypoints/grpc_server.py b/vllm/entrypoints/grpc_server.py
index 1fc3354a4..ec8f4804b 100755
--- a/vllm/entrypoints/grpc_server.py
+++ b/vllm/entrypoints/grpc_server.py
@@ -101,11 +101,15 @@ class VllmEngineServicer(vllm_engine_pb2_grpc.VllmEngineServicer):
         sampling_params = self._sampling_params_from_proto(
             request.sampling_params, stream=request.stream
         )
+        tokenization_kwargs = self._tokenization_kwargs_from_proto(
+            request.sampling_params
+        )
 
         async for output in self.async_llm.generate(
             prompt=prompt,
             sampling_params=sampling_params,
             request_id=request_id,
+            tokenization_kwargs=tokenization_kwargs,
         ):
             # Convert vLLM output to protobuf
             # For streaming, always send chunks
@@ -308,9 +312,6 @@ class VllmEngineServicer(vllm_engine_pb2_grpc.VllmEngineServicer):
             seed=params.seed if params.HasField("seed") else None,
             include_stop_str_in_output=params.include_stop_str_in_output,
             logit_bias=dict(params.logit_bias) if params.logit_bias else None,
-            truncate_prompt_tokens=params.truncate_prompt_tokens
-            if params.HasField("truncate_prompt_tokens")
-            else None,
             structured_outputs=structured_outputs,
             # detokenize must be True if stop strings are used
             detokenize=bool(stop),
@@ -319,6 +320,14 @@ class VllmEngineServicer(vllm_engine_pb2_grpc.VllmEngineServicer):
             else RequestOutputKind.FINAL_ONLY,
         )
 
+    @staticmethod
+    def _tokenization_kwargs_from_proto(
+        params: vllm_engine_pb2.SamplingParams,
+    ) -> dict[str, int] | None:
+        if params.HasField("truncate_prompt_tokens"):
+            return {"truncate_prompt_tokens": params.truncate_prompt_tokens}
+        return None
+
     @staticmethod
     def _chunk_response(output: RequestOutput) -> vllm_engine_pb2.GenerateResponse:
         """
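Taken together, the gRPC changes above move prompt truncation out of `SamplingParams` and into a separate `tokenization_kwargs` dict handed to `AsyncLLM.generate()`. A minimal sketch of the new call shape; `async_llm`, `request`, `prompt`, `sampling_params`, and `request_id` are assumed to exist as in `grpc_server.py` (setup not shown in this diff):

```python
# Sketch of the plumbing added above, not a verbatim excerpt.
tokenization_kwargs = None
if request.sampling_params.HasField("truncate_prompt_tokens"):
    tokenization_kwargs = {
        "truncate_prompt_tokens": request.sampling_params.truncate_prompt_tokens
    }

async for output in async_llm.generate(
    prompt=prompt,
    sampling_params=sampling_params,  # no longer carries truncate_prompt_tokens
    request_id=request_id,
    tokenization_kwargs=tokenization_kwargs,
):
    ...  # convert each output chunk to protobuf, as before
```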
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index ee78d4d48..b3260f914 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import itertools
-import warnings
 from collections.abc import Callable, Iterable, Sequence
 from typing import TYPE_CHECKING, Any
 
@@ -1030,7 +1029,6 @@ class LLM:
         prompts: PromptType | Sequence[PromptType] | DataPrompt,
         pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
         *,
-        truncate_prompt_tokens: int | None = None,
         use_tqdm: bool | Callable[..., tqdm] = True,
         lora_request: list[LoRARequest] | LoRARequest | None = None,
         pooling_task: PoolingTask | None = None,
@@ -1088,20 +1086,6 @@ class LLM:
                 "pooling model."
             )
 
-        if truncate_prompt_tokens is not None:
-            warnings.warn(
-                "The `truncate_prompt_tokens` parameter in `LLM.encode()` "
-                "is deprecated and will be removed in v0.16. "
-                "Please pass it via `tokenization_kwargs` instead.",
-                DeprecationWarning,
-                stacklevel=2,
-            )
-
-            tokenization_kwargs = merge_kwargs(
-                tokenization_kwargs,
-                dict(truncate_prompt_tokens=truncate_prompt_tokens),
-            )
-
         if use_io_processor := (isinstance(prompts, dict) and "data" in prompts):
             if self.io_processor is None:
                 raise ValueError(
@@ -1185,7 +1169,6 @@ class LLM:
         self,
         prompts: PromptType | Sequence[PromptType],
         *,
-        truncate_prompt_tokens: int | None = None,
         use_tqdm: bool | Callable[..., tqdm] = True,
         pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
         lora_request: list[LoRARequest] | LoRARequest | None = None,
@@ -1221,12 +1204,6 @@ class LLM:
                 "Try converting the model using `--convert embed`."
             )
 
-        if truncate_prompt_tokens is not None:
-            tokenization_kwargs = merge_kwargs(
-                tokenization_kwargs,
-                dict(truncate_prompt_tokens=truncate_prompt_tokens),
-            )
-
         items = self.encode(
             prompts,
             use_tqdm=use_tqdm,
@@ -1294,7 +1271,6 @@ class LLM:
         /,
         *,
         pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
-        truncate_prompt_tokens: int | None = None,
         use_tqdm: bool | Callable[..., tqdm] = True,
         lora_request: list[LoRARequest] | LoRARequest | None = None,
         tokenization_kwargs: dict[str, Any] | None = None,
@@ -1319,13 +1295,11 @@ class LLM:
             A list of `PoolingRequestOutput` objects containing the pooled
             hidden states in the same order as the input prompts.
         """
-
         return self.encode(
             prompts,
             use_tqdm=use_tqdm,
             lora_request=lora_request,
             pooling_params=pooling_params,
-            truncate_prompt_tokens=truncate_prompt_tokens,
             pooling_task="token_classify",
             tokenization_kwargs=tokenization_kwargs,
         )
@@ -1771,23 +1745,15 @@ class LLM:
         seq_prompts = prompt_to_seq(prompts)
         seq_params = self._params_to_seq(params, len(seq_prompts))
         seq_lora_requests = self._lora_request_to_seq(lora_request, len(seq_prompts))
-        seq_tok_kwargs = [
-            merge_kwargs(
-                tokenization_kwargs,
-                dict(truncate_prompt_tokens=param.truncate_prompt_tokens),
-            )
-            for param in seq_params
-        ]
         seq_priority = self._priority_to_seq(priority, len(prompts))
 
         return self._render_and_add_requests(
             prompts=(
-                self._preprocess_cmpl_one(prompt, tok_kwargs)
-                for prompt, tok_kwargs in zip(
-                    maybe_tqdm(
-                        seq_prompts, use_tqdm=use_tqdm, desc="Rendering prompts"
-                    ),
-                    seq_tok_kwargs,
+                self._preprocess_cmpl_one(prompt, tokenization_kwargs)
+                for prompt in maybe_tqdm(
+                    seq_prompts,
+                    use_tqdm=use_tqdm,
+                    desc="Rendering prompts",
                 )
             ),
             params=seq_params,
@@ -1841,13 +1807,6 @@ class LLM:
         seq_convs = conversation_to_seq(messages)
         seq_params = self._params_to_seq(params, len(seq_convs))
         seq_lora_requests = self._lora_request_to_seq(lora_request, len(seq_convs))
-        seq_tok_kwargs = [
-            merge_kwargs(
-                tokenization_kwargs,
-                dict(truncate_prompt_tokens=param.truncate_prompt_tokens),
-            )
-            for param in seq_params
-        ]
 
         return self._render_and_run_requests(
             prompts=(
@@ -1859,16 +1818,13 @@ class LLM:
                     add_generation_prompt=add_generation_prompt,
                     continue_final_message=continue_final_message,
                     tools=tools,
-                    tokenization_kwargs=tok_kwargs,
+                    tokenization_kwargs=tokenization_kwargs,
                     mm_processor_kwargs=mm_processor_kwargs,
                 )
-                for conversation, tok_kwargs in zip(
-                    maybe_tqdm(
-                        seq_convs,
-                        use_tqdm=use_tqdm,
-                        desc="Rendering conversations",
-                    ),
-                    seq_tok_kwargs,
+                for conversation in maybe_tqdm(
+                    seq_convs,
+                    use_tqdm=use_tqdm,
+                    desc="Rendering conversations",
                 )
             ),
             params=seq_params,
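For users of the Python API, the removed `truncate_prompt_tokens=` keyword on `LLM.encode()` (and the pooling helpers built on it) maps one-to-one onto `tokenization_kwargs`. A minimal migration sketch; the model name is a placeholder:

```python
from vllm import LLM

llm = LLM(model="my-embedding-model")  # placeholder; any pooling model

# Before this diff (deprecated, previously warned for removal in v0.16):
#   outputs = llm.encode(prompts, truncate_prompt_tokens=128)

# After this diff:
outputs = llm.encode(
    ["a very long document ..."],
    tokenization_kwargs={"truncate_prompt_tokens": 128},
)
```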
""" - return self.encode( prompts, use_tqdm=use_tqdm, lora_request=lora_request, pooling_params=pooling_params, - truncate_prompt_tokens=truncate_prompt_tokens, pooling_task="token_classify", tokenization_kwargs=tokenization_kwargs, ) @@ -1771,23 +1745,15 @@ class LLM: seq_prompts = prompt_to_seq(prompts) seq_params = self._params_to_seq(params, len(seq_prompts)) seq_lora_requests = self._lora_request_to_seq(lora_request, len(seq_prompts)) - seq_tok_kwargs = [ - merge_kwargs( - tokenization_kwargs, - dict(truncate_prompt_tokens=param.truncate_prompt_tokens), - ) - for param in seq_params - ] seq_priority = self._priority_to_seq(priority, len(prompts)) return self._render_and_add_requests( prompts=( - self._preprocess_cmpl_one(prompt, tok_kwargs) - for prompt, tok_kwargs in zip( - maybe_tqdm( - seq_prompts, use_tqdm=use_tqdm, desc="Rendering prompts" - ), - seq_tok_kwargs, + self._preprocess_cmpl_one(prompt, tokenization_kwargs) + for prompt in maybe_tqdm( + seq_prompts, + use_tqdm=use_tqdm, + desc="Rendering prompts", ) ), params=seq_params, @@ -1841,13 +1807,6 @@ class LLM: seq_convs = conversation_to_seq(messages) seq_params = self._params_to_seq(params, len(seq_convs)) seq_lora_requests = self._lora_request_to_seq(lora_request, len(seq_convs)) - seq_tok_kwargs = [ - merge_kwargs( - tokenization_kwargs, - dict(truncate_prompt_tokens=param.truncate_prompt_tokens), - ) - for param in seq_params - ] return self._render_and_run_requests( prompts=( @@ -1859,16 +1818,13 @@ class LLM: add_generation_prompt=add_generation_prompt, continue_final_message=continue_final_message, tools=tools, - tokenization_kwargs=tok_kwargs, + tokenization_kwargs=tokenization_kwargs, mm_processor_kwargs=mm_processor_kwargs, ) - for conversation, tok_kwargs in zip( - maybe_tqdm( - seq_convs, - use_tqdm=use_tqdm, - desc="Rendering conversations", - ), - seq_tok_kwargs, + for conversation in maybe_tqdm( + seq_convs, + use_tqdm=use_tqdm, + desc="Rendering conversations", ) ), params=seq_params, diff --git a/vllm/entrypoints/openai/chat_completion/protocol.py b/vllm/entrypoints/openai/chat_completion/protocol.py index 12bbc44a0..edba28a59 100644 --- a/vllm/entrypoints/openai/chat_completion/protocol.py +++ b/vllm/entrypoints/openai/chat_completion/protocol.py @@ -490,7 +490,6 @@ class ChatCompletionRequest(OpenAIBaseModel): skip_special_tokens=self.skip_special_tokens, spaces_between_special_tokens=self.spaces_between_special_tokens, include_stop_str_in_output=self.include_stop_str_in_output, - truncate_prompt_tokens=self.truncate_prompt_tokens, output_kind=RequestOutputKind.DELTA if self.stream else RequestOutputKind.FINAL_ONLY, diff --git a/vllm/entrypoints/openai/completion/protocol.py b/vllm/entrypoints/openai/completion/protocol.py index 02e6e0d03..222640439 100644 --- a/vllm/entrypoints/openai/completion/protocol.py +++ b/vllm/entrypoints/openai/completion/protocol.py @@ -302,7 +302,6 @@ class CompletionRequest(OpenAIBaseModel): skip_special_tokens=self.skip_special_tokens, spaces_between_special_tokens=self.spaces_between_special_tokens, include_stop_str_in_output=self.include_stop_str_in_output, - truncate_prompt_tokens=self.truncate_prompt_tokens, output_kind=RequestOutputKind.DELTA if self.stream else RequestOutputKind.FINAL_ONLY, diff --git a/vllm/entrypoints/openai/translations/__init__.py b/vllm/entrypoints/openai/translations/__init__.py deleted file mode 100644 index cf210d505..000000000 --- a/vllm/entrypoints/openai/translations/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: 
diff --git a/vllm/entrypoints/openai/translations/api_router.py b/vllm/entrypoints/openai/translations/api_router.py
deleted file mode 100644
index 4a43bf8b9..000000000
--- a/vllm/entrypoints/openai/translations/api_router.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import warnings
-
-warnings.warn(
-    "'vllm.entrypoints.openai.translations.api_router' has been moved to "
-    "'vllm.entrypoints.openai.speech_to_text.api_router'. Please update your "
-    "imports. This backward-compatible alias will be removed in version 0.17+.",
-    DeprecationWarning,
-    stacklevel=2,
-)
-
-from vllm.entrypoints.openai.speech_to_text.api_router import *  # noqa: F401,F403,E402
diff --git a/vllm/entrypoints/openai/translations/protocol.py b/vllm/entrypoints/openai/translations/protocol.py
deleted file mode 100644
index c8ec156d9..000000000
--- a/vllm/entrypoints/openai/translations/protocol.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import warnings
-
-warnings.warn(
-    "'vllm.entrypoints.openai.translations.protocol' has been moved to "
-    "'vllm.entrypoints.openai.speech_to_text.protocol'. Please update your "
-    "imports. This backward-compatible alias will be removed in version 0.17+.",
-    DeprecationWarning,
-    stacklevel=2,
-)
-
-from vllm.entrypoints.openai.speech_to_text.protocol import *  # noqa: F401,F403,E402
diff --git a/vllm/entrypoints/openai/translations/serving.py b/vllm/entrypoints/openai/translations/serving.py
deleted file mode 100644
index 1749d6155..000000000
--- a/vllm/entrypoints/openai/translations/serving.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import warnings
-
-warnings.warn(
-    "'vllm.entrypoints.openai.translations.serving' has been moved to "
-    "'vllm.entrypoints.openai.speech_to_text.serving'. Please update your "
-    "imports. This backward-compatible alias will be removed in version 0.17+.",
-    DeprecationWarning,
-    stacklevel=2,
-)
-
-from vllm.entrypoints.openai.speech_to_text.serving import *  # noqa: F401,F403,E402
diff --git a/vllm/entrypoints/openai/translations/speech_to_text.py b/vllm/entrypoints/openai/translations/speech_to_text.py
deleted file mode 100644
index eb26c6a83..000000000
--- a/vllm/entrypoints/openai/translations/speech_to_text.py
+++ /dev/null
@@ -1,15 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import warnings
-
-warnings.warn(
-    "'vllm.entrypoints.openai.translations.speech_to_text' has been moved to "
-    "'vllm.entrypoints.openai.speech_to_text.speech_to_text'. Please update "
-    "your imports. This backward-compatible alias will be removed in version "
-    "0.17+.",
-    DeprecationWarning,
-    stacklevel=2,
-)
-
-from vllm.entrypoints.openai.speech_to_text.speech_to_text import *  # noqa: F401,F403,E402
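With the last of these alias modules deleted, the old `translations` import paths now fail with `ModuleNotFoundError` instead of emitting a `DeprecationWarning`. The migration is a path change only, e.g.:

```python
# Before (worked with a DeprecationWarning while the aliases existed):
#   from vllm.entrypoints.openai.translations import protocol

# After this diff:
from vllm.entrypoints.openai.speech_to_text import protocol
```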
diff --git a/vllm/entrypoints/pooling/base/protocol.py b/vllm/entrypoints/pooling/base/protocol.py
index 86dc12cbd..53945108d 100644
--- a/vllm/entrypoints/pooling/base/protocol.py
+++ b/vllm/entrypoints/pooling/base/protocol.py
@@ -190,10 +190,6 @@ class EmbedRequestMixin(EncodingRequestMixin):
         description="Whether to use activation for the pooler outputs. "
         "`None` uses the pooler's default, which is `True` in most cases.",
     )
-    normalize: bool | None = Field(
-        default=None,
-        description="Deprecated; please pass `use_activation` instead",
-    )
     # --8<-- [end:embed-extra-params]
diff --git a/vllm/entrypoints/pooling/classify/protocol.py b/vllm/entrypoints/pooling/classify/protocol.py
index 3c4bbd8c2..bfc38ebef 100644
--- a/vllm/entrypoints/pooling/classify/protocol.py
+++ b/vllm/entrypoints/pooling/classify/protocol.py
@@ -40,7 +40,6 @@ class ClassificationCompletionRequest(
     def to_pooling_params(self):
         return PoolingParams(
             task="classify",
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
             use_activation=self.use_activation,
         )
 
@@ -63,7 +62,6 @@ class ClassificationChatRequest(
     def to_pooling_params(self):
         return PoolingParams(
             task="classify",
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
             use_activation=self.use_activation,
         )
diff --git a/vllm/entrypoints/pooling/embed/protocol.py b/vllm/entrypoints/pooling/embed/protocol.py
index 4f83105f2..4b47c6522 100644
--- a/vllm/entrypoints/pooling/embed/protocol.py
+++ b/vllm/entrypoints/pooling/embed/protocol.py
@@ -14,12 +14,9 @@ from vllm.entrypoints.pooling.base.protocol import (
     EmbedRequestMixin,
     PoolingBasicRequestMixin,
 )
-from vllm.logger import init_logger
 from vllm.renderers import TokenizeParams
 from vllm.utils import random_uuid
 
-logger = init_logger(__name__)
-
 
 def _get_max_total_output_tokens(
     model_config: ModelConfig,
@@ -60,18 +57,10 @@ class EmbeddingCompletionRequest(
     )
 
     def to_pooling_params(self):
-        if self.normalize is not None:
-            logger.warning_once(
-                "`normalize` is deprecated and will be removed in v0.17. "
-                "Please pass `use_activation` instead."
-            )
-            self.use_activation = self.normalize
-
         return PoolingParams(
             task="embed",
             dimensions=self.dimensions,
             use_activation=self.use_activation,
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
         )
 
 
@@ -97,18 +86,10 @@ class EmbeddingChatRequest(
     )
 
     def to_pooling_params(self):
-        if self.normalize is not None:
-            logger.warning_once(
-                "`normalize` is deprecated and will be removed in v0.17. "
-                "Please pass `use_activation` instead."
-            )
-            self.use_activation = self.normalize
-
         return PoolingParams(
             task="embed",
             dimensions=self.dimensions,
             use_activation=self.use_activation,
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
         )
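On the HTTP side, the deprecated `normalize` field is gone from embedding requests; clients send `use_activation` instead, exactly as the updated test at the top of this diff does. A hedged client sketch (server URL and model name are placeholders):

```python
import requests

resp = requests.post(
    "http://localhost:8000/v1/embeddings",  # assumed local vLLM server
    json={
        "model": "my-embedding-model",  # placeholder model name
        "input": "What is the capital of France?",
        "encoding_format": "float",
        # "normalize": True,  # field removed by this diff
        "use_activation": True,  # omitted/None -> pooler default (usually True)
    },
)
resp.raise_for_status()
embedding = resp.json()["data"][0]["embedding"]
```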
diff --git a/vllm/entrypoints/pooling/pooling/protocol.py b/vllm/entrypoints/pooling/pooling/protocol.py
index a8c1c59ff..b99f98959 100644
--- a/vllm/entrypoints/pooling/pooling/protocol.py
+++ b/vllm/entrypoints/pooling/pooling/protocol.py
@@ -16,13 +16,10 @@ from vllm.entrypoints.pooling.base.protocol import (
     EncodingRequestMixin,
     PoolingBasicRequestMixin,
 )
-from vllm.logger import init_logger
 from vllm.renderers import TokenizeParams
 from vllm.tasks import PoolingTask
 from vllm.utils import random_uuid
 
-logger = init_logger(__name__)
-
 
 class PoolingCompletionRequest(
     PoolingBasicRequestMixin,
@@ -45,16 +42,8 @@ class PoolingCompletionRequest(
     )
 
     def to_pooling_params(self):
-        if self.normalize is not None:
-            logger.warning_once(
-                "`normalize` is deprecated and will be removed in v0.17. "
-                "Please pass `use_activation` instead."
-            )
-            self.use_activation = self.normalize
-
         return PoolingParams(
             task=self.task,
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
             use_activation=self.use_activation,
             dimensions=self.dimensions,
         )
@@ -78,16 +67,8 @@ class PoolingChatRequest(
     )
 
     def to_pooling_params(self):
-        if self.normalize is not None:
-            logger.warning_once(
-                "`normalize` is deprecated and will be removed in v0.17. "
-                "Please pass `use_activation` instead."
-            )
-            self.use_activation = self.normalize
-
         return PoolingParams(
             task=self.task,
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
             use_activation=self.use_activation,
             dimensions=self.dimensions,
         )
diff --git a/vllm/entrypoints/pooling/score/protocol.py b/vllm/entrypoints/pooling/score/protocol.py
index a85ed5d70..643eeed36 100644
--- a/vllm/entrypoints/pooling/score/protocol.py
+++ b/vllm/entrypoints/pooling/score/protocol.py
@@ -37,7 +37,6 @@ class ScoreRequestMixin(PoolingBasicRequestMixin, ClassifyRequestMixin):
     def to_pooling_params(self, task: PoolingTask = "score"):
         return PoolingParams(
             task=task,
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
             use_activation=self.use_activation,
         )
 
@@ -113,7 +112,6 @@ class RerankRequest(PoolingBasicRequestMixin, ClassifyRequestMixin):
     def to_pooling_params(self, task: PoolingTask = "score"):
         return PoolingParams(
             task=task,
-            truncate_prompt_tokens=self.truncate_prompt_tokens,
             use_activation=self.use_activation,
         )
diff --git a/vllm/model_executor/layers/mamba/mamba_utils.py b/vllm/model_executor/layers/mamba/mamba_utils.py
index fc8912f8c..1f6751f6c 100644
--- a/vllm/model_executor/layers/mamba/mamba_utils.py
+++ b/vllm/model_executor/layers/mamba/mamba_utils.py
@@ -289,9 +289,6 @@ def get_temporal_copy_spec(
     )
 
 
-get_full_copy_spec = get_temporal_copy_spec
-
-
 class MambaStateCopyFuncCalculator:
     @classmethod
     def linear_attention_state_copy_func(cls):
diff --git a/vllm/model_executor/models/ovis2_5.py b/vllm/model_executor/models/ovis2_5.py
index 2d9385c57..57559ba99 100644
--- a/vllm/model_executor/models/ovis2_5.py
+++ b/vllm/model_executor/models/ovis2_5.py
@@ -43,12 +43,9 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 
 IMAGE_TOKEN = "<image>"
-IMAGE_PLACEHOLDER_ID = 151669
 VIDEO_TOKEN = "