[Deprecation] Remove deprecated code in 0.17 as scheduled (#35441)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
This commit is contained in:
Wentao Ye
2026-02-28 12:32:37 -05:00
committed by GitHub
parent 1dafb29f91
commit e113a30113
22 changed files with 31 additions and 250 deletions

View File

@@ -683,13 +683,13 @@ async def test_params_not_supported(
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_normalize(server: RemoteOpenAIServer, model_name: str):
async def get_outputs(normalize):
async def test_use_activation(server: RemoteOpenAIServer, model_name: str):
async def get_outputs(use_activation):
request_args = {
"model": MODEL_NAME,
"input": input_text,
"encoding_format": "float",
"normalize": normalize,
"use_activation": use_activation,
}
response = requests.post(server.url_for("v1/embeddings"), json=request_args)
@@ -697,9 +697,9 @@ async def test_normalize(server: RemoteOpenAIServer, model_name: str):
return torch.tensor([x["embedding"] for x in outputs["data"]])
default = await get_outputs(normalize=None)
w_normal = await get_outputs(normalize=True)
wo_normal = await get_outputs(normalize=False)
default = await get_outputs(use_activation=None)
w_normal = await get_outputs(use_activation=True)
wo_normal = await get_outputs(use_activation=False)
assert torch.allclose(default, w_normal, atol=1e-2), "Default should use normal."
assert not torch.allclose(w_normal, wo_normal, atol=1e-2), (

View File

@@ -101,11 +101,15 @@ class VllmEngineServicer(vllm_engine_pb2_grpc.VllmEngineServicer):
sampling_params = self._sampling_params_from_proto(
request.sampling_params, stream=request.stream
)
tokenization_kwargs = self._tokenization_kwargs_from_proto(
request.sampling_params
)
async for output in self.async_llm.generate(
prompt=prompt,
sampling_params=sampling_params,
request_id=request_id,
tokenization_kwargs=tokenization_kwargs,
):
# Convert vLLM output to protobuf
# For streaming, always send chunks
@@ -308,9 +312,6 @@ class VllmEngineServicer(vllm_engine_pb2_grpc.VllmEngineServicer):
seed=params.seed if params.HasField("seed") else None,
include_stop_str_in_output=params.include_stop_str_in_output,
logit_bias=dict(params.logit_bias) if params.logit_bias else None,
truncate_prompt_tokens=params.truncate_prompt_tokens
if params.HasField("truncate_prompt_tokens")
else None,
structured_outputs=structured_outputs,
# detokenize must be True if stop strings are used
detokenize=bool(stop),
@@ -319,6 +320,14 @@ class VllmEngineServicer(vllm_engine_pb2_grpc.VllmEngineServicer):
else RequestOutputKind.FINAL_ONLY,
)
@staticmethod
def _tokenization_kwargs_from_proto(
    params: vllm_engine_pb2.SamplingParams,
) -> dict[str, int] | None:
    """Derive tokenization kwargs from the protobuf sampling params.

    Args:
        params: Protobuf sampling-params message from the request.

    Returns:
        A single-entry dict mapping ``"truncate_prompt_tokens"`` to the
        value carried by ``params`` when that field is set on the message,
        otherwise ``None``.
    """
    # Guard clause: an unset field means there is nothing to forward.
    if not params.HasField("truncate_prompt_tokens"):
        return None
    return {"truncate_prompt_tokens": params.truncate_prompt_tokens}
@staticmethod
def _chunk_response(output: RequestOutput) -> vllm_engine_pb2.GenerateResponse:
"""

View File

@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import itertools
import warnings
from collections.abc import Callable, Iterable, Sequence
from typing import TYPE_CHECKING, Any
@@ -1030,7 +1029,6 @@ class LLM:
prompts: PromptType | Sequence[PromptType] | DataPrompt,
pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
*,
truncate_prompt_tokens: int | None = None,
use_tqdm: bool | Callable[..., tqdm] = True,
lora_request: list[LoRARequest] | LoRARequest | None = None,
pooling_task: PoolingTask | None = None,
@@ -1088,20 +1086,6 @@ class LLM:
"pooling model."
)
if truncate_prompt_tokens is not None:
warnings.warn(
"The `truncate_prompt_tokens` parameter in `LLM.encode()` "
"is deprecated and will be removed in v0.16. "
"Please pass it via `tokenization_kwargs` instead.",
DeprecationWarning,
stacklevel=2,
)
tokenization_kwargs = merge_kwargs(
tokenization_kwargs,
dict(truncate_prompt_tokens=truncate_prompt_tokens),
)
if use_io_processor := (isinstance(prompts, dict) and "data" in prompts):
if self.io_processor is None:
raise ValueError(
@@ -1185,7 +1169,6 @@ class LLM:
self,
prompts: PromptType | Sequence[PromptType],
*,
truncate_prompt_tokens: int | None = None,
use_tqdm: bool | Callable[..., tqdm] = True,
pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
lora_request: list[LoRARequest] | LoRARequest | None = None,
@@ -1221,12 +1204,6 @@ class LLM:
"Try converting the model using `--convert embed`."
)
if truncate_prompt_tokens is not None:
tokenization_kwargs = merge_kwargs(
tokenization_kwargs,
dict(truncate_prompt_tokens=truncate_prompt_tokens),
)
items = self.encode(
prompts,
use_tqdm=use_tqdm,
@@ -1294,7 +1271,6 @@ class LLM:
/,
*,
pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
truncate_prompt_tokens: int | None = None,
use_tqdm: bool | Callable[..., tqdm] = True,
lora_request: list[LoRARequest] | LoRARequest | None = None,
tokenization_kwargs: dict[str, Any] | None = None,
@@ -1319,13 +1295,11 @@ class LLM:
A list of `PoolingRequestOutput` objects containing the
pooled hidden states in the same order as the input prompts.
"""
return self.encode(
prompts,
use_tqdm=use_tqdm,
lora_request=lora_request,
pooling_params=pooling_params,
truncate_prompt_tokens=truncate_prompt_tokens,
pooling_task="token_classify",
tokenization_kwargs=tokenization_kwargs,
)
@@ -1771,23 +1745,15 @@ class LLM:
seq_prompts = prompt_to_seq(prompts)
seq_params = self._params_to_seq(params, len(seq_prompts))
seq_lora_requests = self._lora_request_to_seq(lora_request, len(seq_prompts))
seq_tok_kwargs = [
merge_kwargs(
tokenization_kwargs,
dict(truncate_prompt_tokens=param.truncate_prompt_tokens),
)
for param in seq_params
]
seq_priority = self._priority_to_seq(priority, len(prompts))
return self._render_and_add_requests(
prompts=(
self._preprocess_cmpl_one(prompt, tok_kwargs)
for prompt, tok_kwargs in zip(
maybe_tqdm(
seq_prompts, use_tqdm=use_tqdm, desc="Rendering prompts"
),
seq_tok_kwargs,
self._preprocess_cmpl_one(prompt, tokenization_kwargs)
for prompt in maybe_tqdm(
seq_prompts,
use_tqdm=use_tqdm,
desc="Rendering prompts",
)
),
params=seq_params,
@@ -1841,13 +1807,6 @@ class LLM:
seq_convs = conversation_to_seq(messages)
seq_params = self._params_to_seq(params, len(seq_convs))
seq_lora_requests = self._lora_request_to_seq(lora_request, len(seq_convs))
seq_tok_kwargs = [
merge_kwargs(
tokenization_kwargs,
dict(truncate_prompt_tokens=param.truncate_prompt_tokens),
)
for param in seq_params
]
return self._render_and_run_requests(
prompts=(
@@ -1859,16 +1818,13 @@ class LLM:
add_generation_prompt=add_generation_prompt,
continue_final_message=continue_final_message,
tools=tools,
tokenization_kwargs=tok_kwargs,
tokenization_kwargs=tokenization_kwargs,
mm_processor_kwargs=mm_processor_kwargs,
)
for conversation, tok_kwargs in zip(
maybe_tqdm(
seq_convs,
use_tqdm=use_tqdm,
desc="Rendering conversations",
),
seq_tok_kwargs,
for conversation in maybe_tqdm(
seq_convs,
use_tqdm=use_tqdm,
desc="Rendering conversations",
)
),
params=seq_params,

View File

@@ -490,7 +490,6 @@ class ChatCompletionRequest(OpenAIBaseModel):
skip_special_tokens=self.skip_special_tokens,
spaces_between_special_tokens=self.spaces_between_special_tokens,
include_stop_str_in_output=self.include_stop_str_in_output,
truncate_prompt_tokens=self.truncate_prompt_tokens,
output_kind=RequestOutputKind.DELTA
if self.stream
else RequestOutputKind.FINAL_ONLY,

View File

@@ -302,7 +302,6 @@ class CompletionRequest(OpenAIBaseModel):
skip_special_tokens=self.skip_special_tokens,
spaces_between_special_tokens=self.spaces_between_special_tokens,
include_stop_str_in_output=self.include_stop_str_in_output,
truncate_prompt_tokens=self.truncate_prompt_tokens,
output_kind=RequestOutputKind.DELTA
if self.stream
else RequestOutputKind.FINAL_ONLY,

View File

@@ -1,12 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import warnings
warnings.warn(
"The 'vllm.entrypoints.openai.translations' module has been renamed to "
"'vllm.entrypoints.openai.speech_to_text'. Please update your imports. "
"This backward-compatible alias will be removed in version 0.17+.",
DeprecationWarning,
stacklevel=2,
)

View File

@@ -1,14 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import warnings
warnings.warn(
"'vllm.entrypoints.openai.translations.api_router' has been moved to "
"'vllm.entrypoints.openai.speech_to_text.api_router'. Please update your "
"imports. This backward-compatible alias will be removed in version 0.17+.",
DeprecationWarning,
stacklevel=2,
)
from vllm.entrypoints.openai.speech_to_text.api_router import * # noqa: F401,F403,E402

View File

@@ -1,14 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import warnings
warnings.warn(
"'vllm.entrypoints.openai.translations.protocol' has been moved to "
"'vllm.entrypoints.openai.speech_to_text.protocol'. Please update your "
"imports. This backward-compatible alias will be removed in version 0.17+.",
DeprecationWarning,
stacklevel=2,
)
from vllm.entrypoints.openai.speech_to_text.protocol import * # noqa: F401,F403,E402

View File

@@ -1,14 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import warnings
warnings.warn(
"'vllm.entrypoints.openai.translations.serving' has been moved to "
"'vllm.entrypoints.openai.speech_to_text.serving'. Please update your "
"imports. This backward-compatible alias will be removed in version 0.17+.",
DeprecationWarning,
stacklevel=2,
)
from vllm.entrypoints.openai.speech_to_text.serving import * # noqa: F401,F403,E402

View File

@@ -1,15 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import warnings
warnings.warn(
"'vllm.entrypoints.openai.translations.speech_to_text' has been moved to "
"'vllm.entrypoints.openai.speech_to_text.speech_to_text'. Please update "
"your imports. This backward-compatible alias will be removed in version "
"0.17+.",
DeprecationWarning,
stacklevel=2,
)
from vllm.entrypoints.openai.speech_to_text.speech_to_text import * # noqa: F401,F403,E402

View File

@@ -190,10 +190,6 @@ class EmbedRequestMixin(EncodingRequestMixin):
description="Whether to use activation for the pooler outputs. "
"`None` uses the pooler's default, which is `True` in most cases.",
)
normalize: bool | None = Field(
default=None,
description="Deprecated; please pass `use_activation` instead",
)
# --8<-- [end:embed-extra-params]

View File

@@ -40,7 +40,6 @@ class ClassificationCompletionRequest(
def to_pooling_params(self):
return PoolingParams(
task="classify",
truncate_prompt_tokens=self.truncate_prompt_tokens,
use_activation=self.use_activation,
)
@@ -63,7 +62,6 @@ class ClassificationChatRequest(
def to_pooling_params(self):
return PoolingParams(
task="classify",
truncate_prompt_tokens=self.truncate_prompt_tokens,
use_activation=self.use_activation,
)

View File

@@ -14,12 +14,9 @@ from vllm.entrypoints.pooling.base.protocol import (
EmbedRequestMixin,
PoolingBasicRequestMixin,
)
from vllm.logger import init_logger
from vllm.renderers import TokenizeParams
from vllm.utils import random_uuid
logger = init_logger(__name__)
def _get_max_total_output_tokens(
model_config: ModelConfig,
@@ -60,18 +57,10 @@ class EmbeddingCompletionRequest(
)
def to_pooling_params(self):
if self.normalize is not None:
logger.warning_once(
"`normalize` is deprecated and will be removed in v0.17. "
"Please pass `use_activation` instead."
)
self.use_activation = self.normalize
return PoolingParams(
task="embed",
dimensions=self.dimensions,
use_activation=self.use_activation,
truncate_prompt_tokens=self.truncate_prompt_tokens,
)
@@ -97,18 +86,10 @@ class EmbeddingChatRequest(
)
def to_pooling_params(self):
if self.normalize is not None:
logger.warning_once(
"`normalize` is deprecated and will be removed in v0.17. "
"Please pass `use_activation` instead."
)
self.use_activation = self.normalize
return PoolingParams(
task="embed",
dimensions=self.dimensions,
use_activation=self.use_activation,
truncate_prompt_tokens=self.truncate_prompt_tokens,
)

View File

@@ -16,13 +16,10 @@ from vllm.entrypoints.pooling.base.protocol import (
EncodingRequestMixin,
PoolingBasicRequestMixin,
)
from vllm.logger import init_logger
from vllm.renderers import TokenizeParams
from vllm.tasks import PoolingTask
from vllm.utils import random_uuid
logger = init_logger(__name__)
class PoolingCompletionRequest(
PoolingBasicRequestMixin,
@@ -45,16 +42,8 @@ class PoolingCompletionRequest(
)
def to_pooling_params(self):
if self.normalize is not None:
logger.warning_once(
"`normalize` is deprecated and will be removed in v0.17. "
"Please pass `use_activation` instead."
)
self.use_activation = self.normalize
return PoolingParams(
task=self.task,
truncate_prompt_tokens=self.truncate_prompt_tokens,
use_activation=self.use_activation,
dimensions=self.dimensions,
)
@@ -78,16 +67,8 @@ class PoolingChatRequest(
)
def to_pooling_params(self):
if self.normalize is not None:
logger.warning_once(
"`normalize` is deprecated and will be removed in v0.17. "
"Please pass `use_activation` instead."
)
self.use_activation = self.normalize
return PoolingParams(
task=self.task,
truncate_prompt_tokens=self.truncate_prompt_tokens,
use_activation=self.use_activation,
dimensions=self.dimensions,
)

View File

@@ -37,7 +37,6 @@ class ScoreRequestMixin(PoolingBasicRequestMixin, ClassifyRequestMixin):
def to_pooling_params(self, task: PoolingTask = "score"):
return PoolingParams(
task=task,
truncate_prompt_tokens=self.truncate_prompt_tokens,
use_activation=self.use_activation,
)
@@ -113,7 +112,6 @@ class RerankRequest(PoolingBasicRequestMixin, ClassifyRequestMixin):
def to_pooling_params(self, task: PoolingTask = "score"):
return PoolingParams(
task=task,
truncate_prompt_tokens=self.truncate_prompt_tokens,
use_activation=self.use_activation,
)

View File

@@ -289,9 +289,6 @@ def get_temporal_copy_spec(
)
get_full_copy_spec = get_temporal_copy_spec
class MambaStateCopyFuncCalculator:
@classmethod
def linear_attention_state_copy_func(cls):

View File

@@ -43,12 +43,9 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
IMAGE_TOKEN = "<image>"
IMAGE_PLACEHOLDER_ID = 151669
VIDEO_TOKEN = "<video>"
VIDEO_PLACEHOLDER_ID = 151670
INDICATOR_IDS = [151672, 151673, 151674, 151675]
IMAGE_PAD_TOKEN_ID = 151655
THINK_END_TOKEN_ID = 151668
class Ovis2_5ImagePatchInputs(TensorSchema):

View File

@@ -17,7 +17,7 @@ from typing import (
import regex as re
import torch
from typing_extensions import TypeVar, assert_never, deprecated
from typing_extensions import TypeVar, assert_never
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
@@ -996,16 +996,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
self.data_parser = self.info.get_data_parser()
@property
@deprecated("Will be removed in v0.17. Use `info.supported_mm_limits` instead.")
def supported_mm_limits(self):
return self.info.supported_mm_limits
@property
@deprecated("Will be removed in v0.17. Use `info.allowed_mm_limits` instead.")
def allowed_mm_limits(self):
return self.info.allowed_mm_limits
def __call__(
self,
prompt: str,

View File

@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import mimetypes
import warnings
from collections import defaultdict
from collections.abc import Generator, Sequence
from itertools import groupby
@@ -30,23 +29,6 @@ else:
torch = LazyLoader("torch", globals(), "torch")
def __getattr__(name: str):
if name == "MEDIA_CONNECTOR_REGISTRY":
from .media import MEDIA_CONNECTOR_REGISTRY
warnings.warn(
"`vllm.multimodal.utils.MEDIA_CONNECTOR_REGISTRY` "
"has been moved to `vllm.multimodal.media.MEDIA_CONNECTOR_REGISTRY`. "
"The old name will be removed in v0.17.",
DeprecationWarning,
stacklevel=2,
)
return MEDIA_CONNECTOR_REGISTRY
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
def encode_audio_base64(
audio: np.ndarray,
sampling_rate: int,

View File

@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from copy import deepcopy
from typing import Annotated, Any
from typing import Any
import msgspec
@@ -19,10 +19,6 @@ class PoolingParams(
"""API parameters for pooling models.
Attributes:
truncate_prompt_tokens: Controls prompt truncation.
Set to -1 to use the model's default truncation size.
Set to k to keep only the last k tokens (left truncation).
Set to None to disable truncation.
use_activation: Whether to apply activation function to the pooler outputs.
`None` uses the pooler's default, which is `True` in most cases.
dimensions: Reduce the dimensions of embeddings
@@ -30,7 +26,6 @@ class PoolingParams(
"""
# --8<-- [start:common-pooling-params]
truncate_prompt_tokens: Annotated[int, msgspec.Meta(ge=-1)] | None = None
use_activation: bool | None = None
# --8<-- [end:common-pooling-params]
@@ -198,7 +193,6 @@ class PoolingParams(
f"returned_token_ids={self.returned_token_ids}, "
f"requires_token_ids={self.requires_token_ids}, "
f"skip_reading_prefix_cache={self.skip_reading_prefix_cache}, "
f"truncate_prompt_tokens={self.truncate_prompt_tokens}, "
f"extra_kwargs={self.extra_kwargs})"
)

View File

@@ -7,7 +7,7 @@ import json as json_mod
from dataclasses import field
from enum import Enum, IntEnum
from functools import cached_property
from typing import Annotated, Any
from typing import Any
import msgspec
from pydantic.dataclasses import dataclass
@@ -209,10 +209,6 @@ class SamplingParams(
"""Whether to add spaces between special tokens in the output."""
include_stop_str_in_output: bool = False
"""Whether to include the stop strings in output text."""
truncate_prompt_tokens: Annotated[int, msgspec.Meta(ge=-1)] | None = None
"""If set to -1, will use the truncation size supported by the model. If
set to an integer k, will use only the last k tokens from the prompt
(i.e., left truncation). If set to `None`, truncation is disabled."""
output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE
skip_clone: bool = False
"""Internal flag indicating that this SamplingParams instance is safe to
@@ -273,7 +269,6 @@ class SamplingParams(
detokenize: bool = True,
skip_special_tokens: bool = True,
spaces_between_special_tokens: bool = True,
truncate_prompt_tokens: Annotated[int, msgspec.Meta(ge=-1)] | None = None,
output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE,
structured_outputs: StructuredOutputsParams | None = None,
logit_bias: dict[int, float] | dict[str, float] | None = None,
@@ -313,7 +308,6 @@ class SamplingParams(
detokenize=detokenize,
skip_special_tokens=skip_special_tokens,
spaces_between_special_tokens=spaces_between_special_tokens,
truncate_prompt_tokens=truncate_prompt_tokens,
output_kind=output_kind,
structured_outputs=structured_outputs,
logit_bias=logit_bias,
@@ -449,15 +443,6 @@ class SamplingParams(
parameter="prompt_logprobs",
value=self.prompt_logprobs,
)
if self.truncate_prompt_tokens is not None and (
self.truncate_prompt_tokens == 0 or self.truncate_prompt_tokens < -1
):
raise VLLMValidationError(
f"truncate_prompt_tokens must be an integer >= 1 or -1, "
f"got {self.truncate_prompt_tokens}",
parameter="truncate_prompt_tokens",
value=self.truncate_prompt_tokens,
)
assert isinstance(self.stop_token_ids, list)
if not all(isinstance(st_id, int) for st_id in self.stop_token_ids):
raise ValueError(
@@ -835,7 +820,6 @@ class SamplingParams(
f"skip_special_tokens={self.skip_special_tokens}, "
"spaces_between_special_tokens="
f"{self.spaces_between_special_tokens}, "
f"truncate_prompt_tokens={self.truncate_prompt_tokens}, "
f"structured_outputs={self.structured_outputs}, "
f"extra_args={self.extra_args})"
)

View File

@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time
import warnings
from collections.abc import Mapping
from typing import Any, Literal
@@ -114,16 +113,6 @@ class InputProcessor:
supported_tasks: tuple[SupportedTask, ...],
) -> None:
"""Raise `ValueError` if SamplingParams or PoolingParams is not valid."""
if params.truncate_prompt_tokens is not None:
params_type = type(params).__name__
warnings.warn(
f"The `truncate_prompt_tokens` parameter in `{params_type}` "
"is deprecated and will be removed in v0.17. "
"Please pass it via `tokenization_kwargs` instead.",
DeprecationWarning,
stacklevel=2,
)
if isinstance(params, SamplingParams):
supported_generation_tasks = [
task for task in supported_tasks if task in GENERATION_TASKS