[Deprecation] Deprecate code in 0.17 as scheduled (#35441)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -683,13 +683,13 @@ async def test_params_not_supported(
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
async def test_normalize(server: RemoteOpenAIServer, model_name: str):
|
||||
async def get_outputs(normalize):
|
||||
async def test_use_activation(server: RemoteOpenAIServer, model_name: str):
|
||||
async def get_outputs(use_activation):
|
||||
request_args = {
|
||||
"model": MODEL_NAME,
|
||||
"input": input_text,
|
||||
"encoding_format": "float",
|
||||
"normalize": normalize,
|
||||
"use_activation": use_activation,
|
||||
}
|
||||
|
||||
response = requests.post(server.url_for("v1/embeddings"), json=request_args)
|
||||
@@ -697,9 +697,9 @@ async def test_normalize(server: RemoteOpenAIServer, model_name: str):
|
||||
|
||||
return torch.tensor([x["embedding"] for x in outputs["data"]])
|
||||
|
||||
default = await get_outputs(normalize=None)
|
||||
w_normal = await get_outputs(normalize=True)
|
||||
wo_normal = await get_outputs(normalize=False)
|
||||
default = await get_outputs(use_activation=None)
|
||||
w_normal = await get_outputs(use_activation=True)
|
||||
wo_normal = await get_outputs(use_activation=False)
|
||||
|
||||
assert torch.allclose(default, w_normal, atol=1e-2), "Default should use normal."
|
||||
assert not torch.allclose(w_normal, wo_normal, atol=1e-2), (
|
||||
|
||||
@@ -101,11 +101,15 @@ class VllmEngineServicer(vllm_engine_pb2_grpc.VllmEngineServicer):
|
||||
sampling_params = self._sampling_params_from_proto(
|
||||
request.sampling_params, stream=request.stream
|
||||
)
|
||||
tokenization_kwargs = self._tokenization_kwargs_from_proto(
|
||||
request.sampling_params
|
||||
)
|
||||
|
||||
async for output in self.async_llm.generate(
|
||||
prompt=prompt,
|
||||
sampling_params=sampling_params,
|
||||
request_id=request_id,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
):
|
||||
# Convert vLLM output to protobuf
|
||||
# For streaming, always send chunks
|
||||
@@ -308,9 +312,6 @@ class VllmEngineServicer(vllm_engine_pb2_grpc.VllmEngineServicer):
|
||||
seed=params.seed if params.HasField("seed") else None,
|
||||
include_stop_str_in_output=params.include_stop_str_in_output,
|
||||
logit_bias=dict(params.logit_bias) if params.logit_bias else None,
|
||||
truncate_prompt_tokens=params.truncate_prompt_tokens
|
||||
if params.HasField("truncate_prompt_tokens")
|
||||
else None,
|
||||
structured_outputs=structured_outputs,
|
||||
# detokenize must be True if stop strings are used
|
||||
detokenize=bool(stop),
|
||||
@@ -319,6 +320,14 @@ class VllmEngineServicer(vllm_engine_pb2_grpc.VllmEngineServicer):
|
||||
else RequestOutputKind.FINAL_ONLY,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _tokenization_kwargs_from_proto(
|
||||
params: vllm_engine_pb2.SamplingParams,
|
||||
) -> dict[str, int] | None:
|
||||
if params.HasField("truncate_prompt_tokens"):
|
||||
return {"truncate_prompt_tokens": params.truncate_prompt_tokens}
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _chunk_response(output: RequestOutput) -> vllm_engine_pb2.GenerateResponse:
|
||||
"""
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import itertools
|
||||
import warnings
|
||||
from collections.abc import Callable, Iterable, Sequence
|
||||
from typing import TYPE_CHECKING, Any
|
||||
|
||||
@@ -1030,7 +1029,6 @@ class LLM:
|
||||
prompts: PromptType | Sequence[PromptType] | DataPrompt,
|
||||
pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
|
||||
*,
|
||||
truncate_prompt_tokens: int | None = None,
|
||||
use_tqdm: bool | Callable[..., tqdm] = True,
|
||||
lora_request: list[LoRARequest] | LoRARequest | None = None,
|
||||
pooling_task: PoolingTask | None = None,
|
||||
@@ -1088,20 +1086,6 @@ class LLM:
|
||||
"pooling model."
|
||||
)
|
||||
|
||||
if truncate_prompt_tokens is not None:
|
||||
warnings.warn(
|
||||
"The `truncate_prompt_tokens` parameter in `LLM.encode()` "
|
||||
"is deprecated and will be removed in v0.16. "
|
||||
"Please pass it via `tokenization_kwargs` instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
tokenization_kwargs = merge_kwargs(
|
||||
tokenization_kwargs,
|
||||
dict(truncate_prompt_tokens=truncate_prompt_tokens),
|
||||
)
|
||||
|
||||
if use_io_processor := (isinstance(prompts, dict) and "data" in prompts):
|
||||
if self.io_processor is None:
|
||||
raise ValueError(
|
||||
@@ -1185,7 +1169,6 @@ class LLM:
|
||||
self,
|
||||
prompts: PromptType | Sequence[PromptType],
|
||||
*,
|
||||
truncate_prompt_tokens: int | None = None,
|
||||
use_tqdm: bool | Callable[..., tqdm] = True,
|
||||
pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
|
||||
lora_request: list[LoRARequest] | LoRARequest | None = None,
|
||||
@@ -1221,12 +1204,6 @@ class LLM:
|
||||
"Try converting the model using `--convert embed`."
|
||||
)
|
||||
|
||||
if truncate_prompt_tokens is not None:
|
||||
tokenization_kwargs = merge_kwargs(
|
||||
tokenization_kwargs,
|
||||
dict(truncate_prompt_tokens=truncate_prompt_tokens),
|
||||
)
|
||||
|
||||
items = self.encode(
|
||||
prompts,
|
||||
use_tqdm=use_tqdm,
|
||||
@@ -1294,7 +1271,6 @@ class LLM:
|
||||
/,
|
||||
*,
|
||||
pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
|
||||
truncate_prompt_tokens: int | None = None,
|
||||
use_tqdm: bool | Callable[..., tqdm] = True,
|
||||
lora_request: list[LoRARequest] | LoRARequest | None = None,
|
||||
tokenization_kwargs: dict[str, Any] | None = None,
|
||||
@@ -1319,13 +1295,11 @@ class LLM:
|
||||
A list of `PoolingRequestOutput` objects containing the
|
||||
pooled hidden states in the same order as the input prompts.
|
||||
"""
|
||||
|
||||
return self.encode(
|
||||
prompts,
|
||||
use_tqdm=use_tqdm,
|
||||
lora_request=lora_request,
|
||||
pooling_params=pooling_params,
|
||||
truncate_prompt_tokens=truncate_prompt_tokens,
|
||||
pooling_task="token_classify",
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
@@ -1771,23 +1745,15 @@ class LLM:
|
||||
seq_prompts = prompt_to_seq(prompts)
|
||||
seq_params = self._params_to_seq(params, len(seq_prompts))
|
||||
seq_lora_requests = self._lora_request_to_seq(lora_request, len(seq_prompts))
|
||||
seq_tok_kwargs = [
|
||||
merge_kwargs(
|
||||
tokenization_kwargs,
|
||||
dict(truncate_prompt_tokens=param.truncate_prompt_tokens),
|
||||
)
|
||||
for param in seq_params
|
||||
]
|
||||
seq_priority = self._priority_to_seq(priority, len(prompts))
|
||||
|
||||
return self._render_and_add_requests(
|
||||
prompts=(
|
||||
self._preprocess_cmpl_one(prompt, tok_kwargs)
|
||||
for prompt, tok_kwargs in zip(
|
||||
maybe_tqdm(
|
||||
seq_prompts, use_tqdm=use_tqdm, desc="Rendering prompts"
|
||||
),
|
||||
seq_tok_kwargs,
|
||||
self._preprocess_cmpl_one(prompt, tokenization_kwargs)
|
||||
for prompt in maybe_tqdm(
|
||||
seq_prompts,
|
||||
use_tqdm=use_tqdm,
|
||||
desc="Rendering prompts",
|
||||
)
|
||||
),
|
||||
params=seq_params,
|
||||
@@ -1841,13 +1807,6 @@ class LLM:
|
||||
seq_convs = conversation_to_seq(messages)
|
||||
seq_params = self._params_to_seq(params, len(seq_convs))
|
||||
seq_lora_requests = self._lora_request_to_seq(lora_request, len(seq_convs))
|
||||
seq_tok_kwargs = [
|
||||
merge_kwargs(
|
||||
tokenization_kwargs,
|
||||
dict(truncate_prompt_tokens=param.truncate_prompt_tokens),
|
||||
)
|
||||
for param in seq_params
|
||||
]
|
||||
|
||||
return self._render_and_run_requests(
|
||||
prompts=(
|
||||
@@ -1859,16 +1818,13 @@ class LLM:
|
||||
add_generation_prompt=add_generation_prompt,
|
||||
continue_final_message=continue_final_message,
|
||||
tools=tools,
|
||||
tokenization_kwargs=tok_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
)
|
||||
for conversation, tok_kwargs in zip(
|
||||
maybe_tqdm(
|
||||
seq_convs,
|
||||
use_tqdm=use_tqdm,
|
||||
desc="Rendering conversations",
|
||||
),
|
||||
seq_tok_kwargs,
|
||||
for conversation in maybe_tqdm(
|
||||
seq_convs,
|
||||
use_tqdm=use_tqdm,
|
||||
desc="Rendering conversations",
|
||||
)
|
||||
),
|
||||
params=seq_params,
|
||||
|
||||
@@ -490,7 +490,6 @@ class ChatCompletionRequest(OpenAIBaseModel):
|
||||
skip_special_tokens=self.skip_special_tokens,
|
||||
spaces_between_special_tokens=self.spaces_between_special_tokens,
|
||||
include_stop_str_in_output=self.include_stop_str_in_output,
|
||||
truncate_prompt_tokens=self.truncate_prompt_tokens,
|
||||
output_kind=RequestOutputKind.DELTA
|
||||
if self.stream
|
||||
else RequestOutputKind.FINAL_ONLY,
|
||||
|
||||
@@ -302,7 +302,6 @@ class CompletionRequest(OpenAIBaseModel):
|
||||
skip_special_tokens=self.skip_special_tokens,
|
||||
spaces_between_special_tokens=self.spaces_between_special_tokens,
|
||||
include_stop_str_in_output=self.include_stop_str_in_output,
|
||||
truncate_prompt_tokens=self.truncate_prompt_tokens,
|
||||
output_kind=RequestOutputKind.DELTA
|
||||
if self.stream
|
||||
else RequestOutputKind.FINAL_ONLY,
|
||||
|
||||
@@ -1,12 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import warnings
|
||||
|
||||
warnings.warn(
|
||||
"The 'vllm.entrypoints.openai.translations' module has been renamed to "
|
||||
"'vllm.entrypoints.openai.speech_to_text'. Please update your imports. "
|
||||
"This backward-compatible alias will be removed in version 0.17+.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
@@ -1,14 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import warnings
|
||||
|
||||
warnings.warn(
|
||||
"'vllm.entrypoints.openai.translations.api_router' has been moved to "
|
||||
"'vllm.entrypoints.openai.speech_to_text.api_router'. Please update your "
|
||||
"imports. This backward-compatible alias will be removed in version 0.17+.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
from vllm.entrypoints.openai.speech_to_text.api_router import * # noqa: F401,F403,E402
|
||||
@@ -1,14 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import warnings
|
||||
|
||||
warnings.warn(
|
||||
"'vllm.entrypoints.openai.translations.protocol' has been moved to "
|
||||
"'vllm.entrypoints.openai.speech_to_text.protocol'. Please update your "
|
||||
"imports. This backward-compatible alias will be removed in version 0.17+.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
from vllm.entrypoints.openai.speech_to_text.protocol import * # noqa: F401,F403,E402
|
||||
@@ -1,14 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import warnings
|
||||
|
||||
warnings.warn(
|
||||
"'vllm.entrypoints.openai.translations.serving' has been moved to "
|
||||
"'vllm.entrypoints.openai.speech_to_text.serving'. Please update your "
|
||||
"imports. This backward-compatible alias will be removed in version 0.17+.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
from vllm.entrypoints.openai.speech_to_text.serving import * # noqa: F401,F403,E402
|
||||
@@ -1,15 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import warnings
|
||||
|
||||
warnings.warn(
|
||||
"'vllm.entrypoints.openai.translations.speech_to_text' has been moved to "
|
||||
"'vllm.entrypoints.openai.speech_to_text.speech_to_text'. Please update "
|
||||
"your imports. This backward-compatible alias will be removed in version "
|
||||
"0.17+.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
from vllm.entrypoints.openai.speech_to_text.speech_to_text import * # noqa: F401,F403,E402
|
||||
@@ -190,10 +190,6 @@ class EmbedRequestMixin(EncodingRequestMixin):
|
||||
description="Whether to use activation for the pooler outputs. "
|
||||
"`None` uses the pooler's default, which is `True` in most cases.",
|
||||
)
|
||||
normalize: bool | None = Field(
|
||||
default=None,
|
||||
description="Deprecated; please pass `use_activation` instead",
|
||||
)
|
||||
# --8<-- [end:embed-extra-params]
|
||||
|
||||
|
||||
|
||||
@@ -40,7 +40,6 @@ class ClassificationCompletionRequest(
|
||||
def to_pooling_params(self):
|
||||
return PoolingParams(
|
||||
task="classify",
|
||||
truncate_prompt_tokens=self.truncate_prompt_tokens,
|
||||
use_activation=self.use_activation,
|
||||
)
|
||||
|
||||
@@ -63,7 +62,6 @@ class ClassificationChatRequest(
|
||||
def to_pooling_params(self):
|
||||
return PoolingParams(
|
||||
task="classify",
|
||||
truncate_prompt_tokens=self.truncate_prompt_tokens,
|
||||
use_activation=self.use_activation,
|
||||
)
|
||||
|
||||
|
||||
@@ -14,12 +14,9 @@ from vllm.entrypoints.pooling.base.protocol import (
|
||||
EmbedRequestMixin,
|
||||
PoolingBasicRequestMixin,
|
||||
)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.renderers import TokenizeParams
|
||||
from vllm.utils import random_uuid
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def _get_max_total_output_tokens(
|
||||
model_config: ModelConfig,
|
||||
@@ -60,18 +57,10 @@ class EmbeddingCompletionRequest(
|
||||
)
|
||||
|
||||
def to_pooling_params(self):
|
||||
if self.normalize is not None:
|
||||
logger.warning_once(
|
||||
"`normalize` is deprecated and will be removed in v0.17. "
|
||||
"Please pass `use_activation` instead."
|
||||
)
|
||||
self.use_activation = self.normalize
|
||||
|
||||
return PoolingParams(
|
||||
task="embed",
|
||||
dimensions=self.dimensions,
|
||||
use_activation=self.use_activation,
|
||||
truncate_prompt_tokens=self.truncate_prompt_tokens,
|
||||
)
|
||||
|
||||
|
||||
@@ -97,18 +86,10 @@ class EmbeddingChatRequest(
|
||||
)
|
||||
|
||||
def to_pooling_params(self):
|
||||
if self.normalize is not None:
|
||||
logger.warning_once(
|
||||
"`normalize` is deprecated and will be removed in v0.17. "
|
||||
"Please pass `use_activation` instead."
|
||||
)
|
||||
self.use_activation = self.normalize
|
||||
|
||||
return PoolingParams(
|
||||
task="embed",
|
||||
dimensions=self.dimensions,
|
||||
use_activation=self.use_activation,
|
||||
truncate_prompt_tokens=self.truncate_prompt_tokens,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -16,13 +16,10 @@ from vllm.entrypoints.pooling.base.protocol import (
|
||||
EncodingRequestMixin,
|
||||
PoolingBasicRequestMixin,
|
||||
)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.renderers import TokenizeParams
|
||||
from vllm.tasks import PoolingTask
|
||||
from vllm.utils import random_uuid
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class PoolingCompletionRequest(
|
||||
PoolingBasicRequestMixin,
|
||||
@@ -45,16 +42,8 @@ class PoolingCompletionRequest(
|
||||
)
|
||||
|
||||
def to_pooling_params(self):
|
||||
if self.normalize is not None:
|
||||
logger.warning_once(
|
||||
"`normalize` is deprecated and will be removed in v0.17. "
|
||||
"Please pass `use_activation` instead."
|
||||
)
|
||||
self.use_activation = self.normalize
|
||||
|
||||
return PoolingParams(
|
||||
task=self.task,
|
||||
truncate_prompt_tokens=self.truncate_prompt_tokens,
|
||||
use_activation=self.use_activation,
|
||||
dimensions=self.dimensions,
|
||||
)
|
||||
@@ -78,16 +67,8 @@ class PoolingChatRequest(
|
||||
)
|
||||
|
||||
def to_pooling_params(self):
|
||||
if self.normalize is not None:
|
||||
logger.warning_once(
|
||||
"`normalize` is deprecated and will be removed in v0.17. "
|
||||
"Please pass `use_activation` instead."
|
||||
)
|
||||
self.use_activation = self.normalize
|
||||
|
||||
return PoolingParams(
|
||||
task=self.task,
|
||||
truncate_prompt_tokens=self.truncate_prompt_tokens,
|
||||
use_activation=self.use_activation,
|
||||
dimensions=self.dimensions,
|
||||
)
|
||||
|
||||
@@ -37,7 +37,6 @@ class ScoreRequestMixin(PoolingBasicRequestMixin, ClassifyRequestMixin):
|
||||
def to_pooling_params(self, task: PoolingTask = "score"):
|
||||
return PoolingParams(
|
||||
task=task,
|
||||
truncate_prompt_tokens=self.truncate_prompt_tokens,
|
||||
use_activation=self.use_activation,
|
||||
)
|
||||
|
||||
@@ -113,7 +112,6 @@ class RerankRequest(PoolingBasicRequestMixin, ClassifyRequestMixin):
|
||||
def to_pooling_params(self, task: PoolingTask = "score"):
|
||||
return PoolingParams(
|
||||
task=task,
|
||||
truncate_prompt_tokens=self.truncate_prompt_tokens,
|
||||
use_activation=self.use_activation,
|
||||
)
|
||||
|
||||
|
||||
@@ -289,9 +289,6 @@ def get_temporal_copy_spec(
|
||||
)
|
||||
|
||||
|
||||
get_full_copy_spec = get_temporal_copy_spec
|
||||
|
||||
|
||||
class MambaStateCopyFuncCalculator:
|
||||
@classmethod
|
||||
def linear_attention_state_copy_func(cls):
|
||||
|
||||
@@ -43,12 +43,9 @@ from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
|
||||
|
||||
IMAGE_TOKEN = "<image>"
|
||||
IMAGE_PLACEHOLDER_ID = 151669
|
||||
VIDEO_TOKEN = "<video>"
|
||||
VIDEO_PLACEHOLDER_ID = 151670
|
||||
INDICATOR_IDS = [151672, 151673, 151674, 151675]
|
||||
IMAGE_PAD_TOKEN_ID = 151655
|
||||
THINK_END_TOKEN_ID = 151668
|
||||
|
||||
|
||||
class Ovis2_5ImagePatchInputs(TensorSchema):
|
||||
|
||||
@@ -17,7 +17,7 @@ from typing import (
|
||||
|
||||
import regex as re
|
||||
import torch
|
||||
from typing_extensions import TypeVar, assert_never, deprecated
|
||||
from typing_extensions import TypeVar, assert_never
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
@@ -996,16 +996,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
||||
|
||||
self.data_parser = self.info.get_data_parser()
|
||||
|
||||
@property
|
||||
@deprecated("Will be removed in v0.17. Use `info.supported_mm_limits` instead.")
|
||||
def supported_mm_limits(self):
|
||||
return self.info.supported_mm_limits
|
||||
|
||||
@property
|
||||
@deprecated("Will be removed in v0.17. Use `info.allowed_mm_limits` instead.")
|
||||
def allowed_mm_limits(self):
|
||||
return self.info.allowed_mm_limits
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
prompt: str,
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import mimetypes
|
||||
import warnings
|
||||
from collections import defaultdict
|
||||
from collections.abc import Generator, Sequence
|
||||
from itertools import groupby
|
||||
@@ -30,23 +29,6 @@ else:
|
||||
torch = LazyLoader("torch", globals(), "torch")
|
||||
|
||||
|
||||
def __getattr__(name: str):
|
||||
if name == "MEDIA_CONNECTOR_REGISTRY":
|
||||
from .media import MEDIA_CONNECTOR_REGISTRY
|
||||
|
||||
warnings.warn(
|
||||
"`vllm.multimodal.utils.MEDIA_CONNECTOR_REGISTRY` "
|
||||
"has been moved to `vllm.multimodal.media.MEDIA_CONNECTOR_REGISTRY`. "
|
||||
"The old name will be removed in v0.17.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
return MEDIA_CONNECTOR_REGISTRY
|
||||
|
||||
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
||||
|
||||
|
||||
def encode_audio_base64(
|
||||
audio: np.ndarray,
|
||||
sampling_rate: int,
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from copy import deepcopy
|
||||
from typing import Annotated, Any
|
||||
from typing import Any
|
||||
|
||||
import msgspec
|
||||
|
||||
@@ -19,10 +19,6 @@ class PoolingParams(
|
||||
"""API parameters for pooling models.
|
||||
|
||||
Attributes:
|
||||
truncate_prompt_tokens: Controls prompt truncation.
|
||||
Set to -1 to use the model's default truncation size.
|
||||
Set to k to keep only the last k tokens (left truncation).
|
||||
Set to None to disable truncation.
|
||||
use_activation: Whether to apply activation function to the pooler outputs.
|
||||
`None` uses the pooler's default, which is `True` in most cases.
|
||||
dimensions: Reduce the dimensions of embeddings
|
||||
@@ -30,7 +26,6 @@ class PoolingParams(
|
||||
"""
|
||||
|
||||
# --8<-- [start:common-pooling-params]
|
||||
truncate_prompt_tokens: Annotated[int, msgspec.Meta(ge=-1)] | None = None
|
||||
use_activation: bool | None = None
|
||||
# --8<-- [end:common-pooling-params]
|
||||
|
||||
@@ -198,7 +193,6 @@ class PoolingParams(
|
||||
f"returned_token_ids={self.returned_token_ids}, "
|
||||
f"requires_token_ids={self.requires_token_ids}, "
|
||||
f"skip_reading_prefix_cache={self.skip_reading_prefix_cache}, "
|
||||
f"truncate_prompt_tokens={self.truncate_prompt_tokens}, "
|
||||
f"extra_kwargs={self.extra_kwargs})"
|
||||
)
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ import json as json_mod
|
||||
from dataclasses import field
|
||||
from enum import Enum, IntEnum
|
||||
from functools import cached_property
|
||||
from typing import Annotated, Any
|
||||
from typing import Any
|
||||
|
||||
import msgspec
|
||||
from pydantic.dataclasses import dataclass
|
||||
@@ -209,10 +209,6 @@ class SamplingParams(
|
||||
"""Whether to add spaces between special tokens in the output."""
|
||||
include_stop_str_in_output: bool = False
|
||||
"""Whether to include the stop strings in output text."""
|
||||
truncate_prompt_tokens: Annotated[int, msgspec.Meta(ge=-1)] | None = None
|
||||
"""If set to -1, will use the truncation size supported by the model. If
|
||||
set to an integer k, will use only the last k tokens from the prompt
|
||||
(i.e., left truncation). If set to `None`, truncation is disabled."""
|
||||
output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE
|
||||
skip_clone: bool = False
|
||||
"""Internal flag indicating that this SamplingParams instance is safe to
|
||||
@@ -273,7 +269,6 @@ class SamplingParams(
|
||||
detokenize: bool = True,
|
||||
skip_special_tokens: bool = True,
|
||||
spaces_between_special_tokens: bool = True,
|
||||
truncate_prompt_tokens: Annotated[int, msgspec.Meta(ge=-1)] | None = None,
|
||||
output_kind: RequestOutputKind = RequestOutputKind.CUMULATIVE,
|
||||
structured_outputs: StructuredOutputsParams | None = None,
|
||||
logit_bias: dict[int, float] | dict[str, float] | None = None,
|
||||
@@ -313,7 +308,6 @@ class SamplingParams(
|
||||
detokenize=detokenize,
|
||||
skip_special_tokens=skip_special_tokens,
|
||||
spaces_between_special_tokens=spaces_between_special_tokens,
|
||||
truncate_prompt_tokens=truncate_prompt_tokens,
|
||||
output_kind=output_kind,
|
||||
structured_outputs=structured_outputs,
|
||||
logit_bias=logit_bias,
|
||||
@@ -449,15 +443,6 @@ class SamplingParams(
|
||||
parameter="prompt_logprobs",
|
||||
value=self.prompt_logprobs,
|
||||
)
|
||||
if self.truncate_prompt_tokens is not None and (
|
||||
self.truncate_prompt_tokens == 0 or self.truncate_prompt_tokens < -1
|
||||
):
|
||||
raise VLLMValidationError(
|
||||
f"truncate_prompt_tokens must be an integer >= 1 or -1, "
|
||||
f"got {self.truncate_prompt_tokens}",
|
||||
parameter="truncate_prompt_tokens",
|
||||
value=self.truncate_prompt_tokens,
|
||||
)
|
||||
assert isinstance(self.stop_token_ids, list)
|
||||
if not all(isinstance(st_id, int) for st_id in self.stop_token_ids):
|
||||
raise ValueError(
|
||||
@@ -835,7 +820,6 @@ class SamplingParams(
|
||||
f"skip_special_tokens={self.skip_special_tokens}, "
|
||||
"spaces_between_special_tokens="
|
||||
f"{self.spaces_between_special_tokens}, "
|
||||
f"truncate_prompt_tokens={self.truncate_prompt_tokens}, "
|
||||
f"structured_outputs={self.structured_outputs}, "
|
||||
f"extra_args={self.extra_args})"
|
||||
)
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import time
|
||||
import warnings
|
||||
from collections.abc import Mapping
|
||||
from typing import Any, Literal
|
||||
|
||||
@@ -114,16 +113,6 @@ class InputProcessor:
|
||||
supported_tasks: tuple[SupportedTask, ...],
|
||||
) -> None:
|
||||
"""Raise `ValueError` if SamplingParams or PoolingParams is not valid."""
|
||||
if params.truncate_prompt_tokens is not None:
|
||||
params_type = type(params).__name__
|
||||
warnings.warn(
|
||||
f"The `truncate_prompt_tokens` parameter in `{params_type}` "
|
||||
"is deprecated and will be removed in v0.17. "
|
||||
"Please pass it via `tokenization_kwargs` instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
if isinstance(params, SamplingParams):
|
||||
supported_generation_tasks = [
|
||||
task for task in supported_tasks if task in GENERATION_TASKS
|
||||
|
||||
Reference in New Issue
Block a user