[Frontend] Use new Renderer for Completions and Tokenize API (#32863)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -2,8 +2,9 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import itertools
|
||||
import warnings
|
||||
from collections.abc import Callable, Sequence
|
||||
from typing import TYPE_CHECKING, Any, cast
|
||||
from typing import TYPE_CHECKING, Any, TypeAlias, cast
|
||||
|
||||
import cloudpickle
|
||||
import torch.nn as nn
|
||||
@@ -46,15 +47,17 @@ from vllm.entrypoints.pooling.score.utils import (
|
||||
compress_token_type_ids,
|
||||
get_score_prompt,
|
||||
)
|
||||
from vllm.entrypoints.utils import _validate_truncation_size, log_non_default_args
|
||||
from vllm.entrypoints.utils import log_non_default_args
|
||||
from vllm.inputs import (
|
||||
DataPrompt,
|
||||
EmbedsPrompt,
|
||||
ExplicitEncoderDecoderPrompt,
|
||||
PromptType,
|
||||
SingletonPrompt,
|
||||
TextPrompt,
|
||||
TokensPrompt,
|
||||
)
|
||||
from vllm.inputs.parse import get_prompt_components
|
||||
from vllm.inputs.parse import get_prompt_components, is_explicit_encoder_decoder_prompt
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.model_executor.layers.quantization import QuantizationMethods
|
||||
@@ -67,6 +70,7 @@ from vllm.outputs import (
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.pooling_params import PoolingParams
|
||||
from vllm.renderers import ChatParams, TokenizeParams, merge_kwargs
|
||||
from vllm.sampling_params import BeamSearchParams, RequestOutputKind, SamplingParams
|
||||
from vllm.tasks import PoolingTask
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
@@ -74,7 +78,6 @@ from vllm.tokenizers.mistral import MistralTokenizer
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils.collection_utils import as_iter, is_list_of
|
||||
from vllm.utils.counter import Counter
|
||||
from vllm.v1.engine import EngineCoreRequest
|
||||
from vllm.v1.engine.llm_engine import LLMEngine
|
||||
from vllm.v1.sample.logits_processor import LogitsProcessor
|
||||
|
||||
@@ -85,6 +88,9 @@ logger = init_logger(__name__)
|
||||
|
||||
_R = TypeVar("_R", default=Any)
|
||||
|
||||
EnginePrompt: TypeAlias = TextPrompt | TokensPrompt | EmbedsPrompt
|
||||
EngineEncDecPrompt: TypeAlias = ExplicitEncoderDecoderPrompt[EnginePrompt, EnginePrompt]
|
||||
|
||||
|
||||
class LLM:
|
||||
"""An LLM for generating texts from given prompts and sampling parameters.
|
||||
@@ -372,6 +378,7 @@ class LLM:
|
||||
use_tqdm: bool | Callable[..., tqdm] = True,
|
||||
lora_request: list[LoRARequest] | LoRARequest | None = None,
|
||||
priority: list[int] | None = None,
|
||||
tokenization_kwargs: dict[str, Any] | None = None,
|
||||
) -> list[RequestOutput]:
|
||||
"""Generates the completions for the input prompts.
|
||||
|
||||
@@ -398,15 +405,11 @@ class LLM:
|
||||
If provided, must be a list of integers matching the length
|
||||
of `prompts`, where each priority value corresponds to the prompt
|
||||
at the same index.
|
||||
tokenization_kwargs: Overrides for `tokenizer.encode`.
|
||||
|
||||
Returns:
|
||||
A list of `RequestOutput` objects containing the
|
||||
generated completions in the same order as the input prompts.
|
||||
|
||||
Note:
|
||||
Using `prompts` and `prompt_token_ids` as keyword parameters is
|
||||
considered legacy and may be deprecated in the future. You should
|
||||
instead pass them via the `inputs` parameter.
|
||||
"""
|
||||
model_config = self.model_config
|
||||
runner_type = model_config.runner_type
|
||||
@@ -418,17 +421,14 @@ class LLM:
|
||||
)
|
||||
|
||||
if sampling_params is None:
|
||||
# Use default sampling params.
|
||||
sampling_params = self.get_default_sampling_params()
|
||||
|
||||
# Add any modality specific loras to the corresponding prompts
|
||||
lora_request = self._get_modality_specific_lora_reqs(prompts, lora_request)
|
||||
|
||||
self._validate_and_add_requests(
|
||||
prompts=prompts,
|
||||
params=sampling_params,
|
||||
use_tqdm=use_tqdm,
|
||||
lora_request=lora_request,
|
||||
lora_request=self._get_modality_specific_lora_reqs(prompts, lora_request),
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
priority=priority,
|
||||
)
|
||||
|
||||
@@ -771,65 +771,169 @@ class LLM:
|
||||
|
||||
return outputs
|
||||
|
||||
def preprocess_chat(
|
||||
def _get_cmpl_tok_params(self, tokenization_kwargs: dict[str, Any] | None):
    """Build the tokenization parameters for completion-style requests.

    Args:
        tokenization_kwargs: Optional overrides, applied on top of the
            model-derived defaults through `TokenizeParams.with_kwargs`.

    Returns:
        The value returned by `TokenizeParams(...).with_kwargs(...)`.
    """
    cfg = self.model_config
    enc_cfg = cfg.encoder_config or {}

    max_len = cfg.max_model_len
    lower_case = enc_cfg.get("do_lower_case", False)
    # For Whisper, special tokens should be provided by the user based
    # on the task and language of their request. Also needed to avoid
    # appending an EOS token to the prompt which disrupts generation.
    special_tokens = not cfg.is_encoder_decoder

    defaults = TokenizeParams(
        max_total_tokens=max_len,
        do_lower_case=lower_case,
        add_special_tokens=special_tokens,
    )
    return defaults.with_kwargs(tokenization_kwargs)
|
||||
|
||||
def _normalize_prompts(
|
||||
self,
|
||||
messages: list[ChatCompletionMessageParam]
|
||||
prompts: PromptType | Sequence[PromptType],
|
||||
) -> list[EnginePrompt | EngineEncDecPrompt]:
|
||||
if isinstance(prompts, str):
|
||||
prompts = TextPrompt(prompt=prompts)
|
||||
|
||||
return prompts if isinstance(prompts, Sequence) else [prompts] # type: ignore[return-value]
|
||||
|
||||
def _preprocess_cmpl_singleton(
    self,
    prompt: SingletonPrompt,
    tok_params: TokenizeParams,
    *,
    tokenize: bool,
) -> EnginePrompt:
    """Convert one singleton prompt into an engine prompt.

    Args:
        prompt: The prompt to convert. Anything that is not a dict is
            first passed through `renderer.render_completion`.
        tok_params: Parameters forwarded to `renderer.tokenize_prompt`.
        tokenize: When true, the prompt is passed to
            `renderer.tokenize_prompt`; otherwise it is returned as-is
            after the optional rendering step.

    Returns:
        The rendered (and, if requested, tokenized) prompt.
    """
    renderer = self.llm_engine.renderer

    if isinstance(prompt, dict):
        rendered = prompt
    else:
        rendered = renderer.render_completion(prompt)

    if not tokenize:
        return rendered

    return renderer.tokenize_prompt(rendered, tok_params)
|
||||
|
||||
def _preprocess_cmpl_enc_dec(
    self,
    prompt: ExplicitEncoderDecoderPrompt,
    tok_params: TokenizeParams,
) -> EngineEncDecPrompt:
    """Convert an explicit encoder/decoder prompt into an engine prompt.

    The encoder prompt and the decoder prompt (when it is not `None`) are
    each handled by `_preprocess_cmpl_singleton`.

    Args:
        prompt: Mapping with `"encoder_prompt"` and `"decoder_prompt"` keys.
        tok_params: Tokenization parameters used for both halves.

    Returns:
        An `EngineEncDecPrompt` holding the processed halves; the decoder
        half stays `None` when the input decoder prompt is `None`.
    """
    enc_prompt = prompt["encoder_prompt"]
    dec_prompt = prompt["decoder_prompt"]

    # TODO: Move multi-modal processor into tokenization
    should_tokenize = not self.model_config.is_multimodal_model

    processed_enc = self._preprocess_cmpl_singleton(
        enc_prompt,
        tok_params,
        tokenize=should_tokenize,
    )

    if dec_prompt is None:
        processed_dec = None
    else:
        processed_dec = self._preprocess_cmpl_singleton(
            dec_prompt,
            tok_params,
            tokenize=should_tokenize,
        )

    return EngineEncDecPrompt(
        encoder_prompt=processed_enc,
        decoder_prompt=processed_dec,
    )
|
||||
|
||||
def _preprocess_completion(
    self,
    prompts: PromptType | Sequence[PromptType],
    tokenization_kwargs: dict[str, Any] | None = None,
) -> list[EnginePrompt | EngineEncDecPrompt]:
    """
    Convert prompt inputs from LLM APIs (other than [LLM.chat][]) into
    a format that can be passed to `_add_request`.

    Refer to [LLM.generate][] for a complete description of the arguments.

    Args:
        prompts: A single prompt or a sequence of prompts; normalized to
            a list with `_normalize_prompts`.
        tokenization_kwargs: Optional overrides used to build the
            tokenization parameters via `_get_cmpl_tok_params`.

    Returns:
        One engine prompt per input prompt, in input order. Explicit
        encoder/decoder prompts are converted by
        `_preprocess_cmpl_enc_dec`; every other prompt is converted by
        `_preprocess_cmpl_singleton`, which tokenizes it only when the
        model is not a multi-modal model.
    """
    tok_params = self._get_cmpl_tok_params(tokenization_kwargs)

    # Some MM models have non-default `add_special_tokens`
    # TODO: Move multi-modal processor into tokenization
    tokenize_singletons = not self.model_config.is_multimodal_model

    engine_prompts = list[EnginePrompt | EngineEncDecPrompt]()
    for prompt in self._normalize_prompts(prompts):
        if is_explicit_encoder_decoder_prompt(prompt):
            engine_prompt = self._preprocess_cmpl_enc_dec(prompt, tok_params)
        else:
            engine_prompt = self._preprocess_cmpl_singleton(
                prompt,
                tok_params,
                tokenize=tokenize_singletons,
            )

        engine_prompts.append(engine_prompt)

    return engine_prompts
|
||||
|
||||
def _normalize_conversations(
    self,
    conversations: list[ChatCompletionMessageParam]
    | list[list[ChatCompletionMessageParam]],
) -> list[list[ChatCompletionMessageParam]]:
    """Normalize one conversation or a batch of conversations into a batch.

    Args:
        conversations: Either a single conversation (a list of messages)
            or a list of conversations.

    Returns:
        `conversations` unchanged when `is_list_of(conversations, list)`
        holds (presumably: it is already a list of lists -- confirm
        against `is_list_of`), otherwise `conversations` wrapped in a
        one-element list.
    """
    if is_list_of(conversations, list):
        return conversations  # type: ignore[return-value]

    return [conversations]  # type: ignore[list-item]
|
||||
|
||||
def _get_chat_tok_params(self, tokenization_kwargs: dict[str, Any] | None):
    """Build the tokenization parameters for chat-style requests.

    Same defaults as the completion variant except that
    `add_special_tokens` is always `False` here.
    NOTE(review): presumably because the chat template already emits the
    special tokens -- confirm.

    Args:
        tokenization_kwargs: Optional overrides, applied on top of the
            defaults through `TokenizeParams.with_kwargs`.

    Returns:
        The value returned by `TokenizeParams(...).with_kwargs(...)`.
    """
    cfg = self.model_config
    enc_cfg = cfg.encoder_config or {}

    max_len = cfg.max_model_len
    lower_case = enc_cfg.get("do_lower_case", False)

    defaults = TokenizeParams(
        max_total_tokens=max_len,
        do_lower_case=lower_case,
        add_special_tokens=False,
    )
    return defaults.with_kwargs(tokenization_kwargs)
|
||||
|
||||
def _preprocess_chat(
|
||||
self,
|
||||
conversations: list[ChatCompletionMessageParam]
|
||||
| list[list[ChatCompletionMessageParam]],
|
||||
chat_template: str | None = None,
|
||||
chat_template_content_format: ChatTemplateContentFormatOption = "auto",
|
||||
chat_template_kwargs: dict[str, Any] | None = None,
|
||||
add_generation_prompt: bool = True,
|
||||
continue_final_message: bool = False,
|
||||
tools: list[dict[str, Any]] | None = None,
|
||||
chat_template_kwargs: dict[str, Any] | None = None,
|
||||
tokenization_kwargs: dict[str, Any] | None = None,
|
||||
mm_processor_kwargs: dict[str, Any] | None = None,
|
||||
) -> list[TextPrompt | TokensPrompt]:
|
||||
) -> list[EnginePrompt]:
|
||||
"""
|
||||
Generate prompt for a chat conversation. The pre-processed
|
||||
prompt can then be used as input for the other LLM methods.
|
||||
Convert a list of conversations into prompts so that they can then
|
||||
be used as input for other LLM APIs.
|
||||
|
||||
Refer to [LLM.chat][] for a complete description of the arguments.
|
||||
|
||||
Refer to `chat` for a complete description of the arguments.
|
||||
Returns:
|
||||
A list of `TokensPrompts` objects containing the tokenized
|
||||
prompt after chat template interpolation, and the
|
||||
pre-processed multi-modal inputs.
|
||||
A list of `TokensPrompts` objects containing the tokenized prompt
|
||||
after chat template interpolation, and the raw multi-modal inputs.
|
||||
"""
|
||||
list_of_messages: list[list[ChatCompletionMessageParam]]
|
||||
|
||||
# Handle multi and single conversations
|
||||
if is_list_of(messages, list):
|
||||
# messages is list[list[...]]
|
||||
list_of_messages = cast(list[list[ChatCompletionMessageParam]], messages)
|
||||
else:
|
||||
# messages is list[...]
|
||||
list_of_messages = [cast(list[ChatCompletionMessageParam], messages)]
|
||||
|
||||
renderer = self.llm_engine.renderer
|
||||
|
||||
chat_template_kwargs = {
|
||||
"chat_template": chat_template,
|
||||
"add_generation_prompt": add_generation_prompt,
|
||||
"continue_final_message": continue_final_message,
|
||||
"tools": tools,
|
||||
**(chat_template_kwargs or {}),
|
||||
}
|
||||
chat_params = ChatParams(
|
||||
chat_template=chat_template,
|
||||
chat_template_content_format=chat_template_content_format,
|
||||
chat_template_kwargs=merge_kwargs(
|
||||
chat_template_kwargs,
|
||||
dict(
|
||||
add_generation_prompt=add_generation_prompt,
|
||||
continue_final_message=continue_final_message,
|
||||
tools=tools,
|
||||
tokenize=isinstance(renderer.tokenizer, MistralTokenizer),
|
||||
),
|
||||
),
|
||||
)
|
||||
tok_params = self._get_chat_tok_params(tokenization_kwargs)
|
||||
|
||||
prompts = list[TextPrompt | TokensPrompt]()
|
||||
|
||||
for msgs in list_of_messages:
|
||||
# NOTE: renderer.render_messages() currently doesn't
|
||||
# handle mm_processor_kwargs, since there is no implementation in
|
||||
# the chat message parsing for it.
|
||||
_, prompt = renderer.render_messages(
|
||||
msgs,
|
||||
chat_template_content_format=chat_template_content_format,
|
||||
**chat_template_kwargs,
|
||||
)
|
||||
engine_prompts = list[EnginePrompt]()
|
||||
for conversation in self._normalize_conversations(conversations):
|
||||
_, in_prompt = renderer.render_messages(conversation, chat_params)
|
||||
if mm_processor_kwargs is not None:
|
||||
prompt["mm_processor_kwargs"] = mm_processor_kwargs
|
||||
in_prompt["mm_processor_kwargs"] = mm_processor_kwargs
|
||||
|
||||
prompts.append(prompt)
|
||||
engine_prompts.append(renderer.tokenize_prompt(in_prompt, tok_params))
|
||||
|
||||
return prompts
|
||||
return engine_prompts
|
||||
|
||||
def chat(
|
||||
self,
|
||||
@@ -844,6 +948,7 @@ class LLM:
|
||||
continue_final_message: bool = False,
|
||||
tools: list[dict[str, Any]] | None = None,
|
||||
chat_template_kwargs: dict[str, Any] | None = None,
|
||||
tokenization_kwargs: dict[str, Any] | None = None,
|
||||
mm_processor_kwargs: dict[str, Any] | None = None,
|
||||
) -> list[RequestOutput]:
|
||||
"""
|
||||
@@ -889,22 +994,22 @@ class LLM:
|
||||
`True` if `add_generation_prompt` is also `True`.
|
||||
chat_template_kwargs: Additional kwargs to pass to the chat
|
||||
template.
|
||||
mm_processor_kwargs: Multimodal processor kwarg overrides for this
|
||||
chat request. Only used for offline requests.
|
||||
tokenization_kwargs: Overrides for `tokenizer.encode`.
|
||||
mm_processor_kwargs: Overrides for `processor.__call__`.
|
||||
|
||||
Returns:
|
||||
A list of `RequestOutput` objects containing the generated
|
||||
responses in the same order as the input messages.
|
||||
"""
|
||||
|
||||
prompts = self.preprocess_chat(
|
||||
messages=messages,
|
||||
prompts = self._preprocess_chat(
|
||||
messages,
|
||||
chat_template=chat_template,
|
||||
chat_template_content_format=chat_template_content_format,
|
||||
chat_template_kwargs=chat_template_kwargs,
|
||||
add_generation_prompt=add_generation_prompt,
|
||||
continue_final_message=continue_final_message,
|
||||
tools=tools,
|
||||
chat_template_kwargs=chat_template_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
)
|
||||
|
||||
@@ -913,6 +1018,7 @@ class LLM:
|
||||
sampling_params=sampling_params,
|
||||
use_tqdm=use_tqdm,
|
||||
lora_request=lora_request,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
|
||||
def encode(
|
||||
@@ -945,37 +1051,29 @@ class LLM:
|
||||
If `False`, no progress bar is created.
|
||||
lora_request: LoRA request to use for generation, if any.
|
||||
pooling_task: Override the pooling task to use.
|
||||
tokenization_kwargs: overrides tokenization_kwargs set in
|
||||
pooling_params
|
||||
tokenization_kwargs: Overrides for `tokenizer.encode`.
|
||||
|
||||
Returns:
|
||||
A list of `PoolingRequestOutput` objects containing the
|
||||
pooled hidden states in the same order as the input prompts.
|
||||
|
||||
Note:
|
||||
Using `prompts` and `prompt_token_ids` as keyword parameters is
|
||||
considered legacy and may be deprecated in the future. You should
|
||||
instead pass them via the `inputs` parameter.
|
||||
"""
|
||||
|
||||
error_str = (
|
||||
"pooling_task required for `LLM.encode`\n"
|
||||
"Please use one of the more specific methods or set the "
|
||||
"pooling_task when using `LLM.encode`:\n"
|
||||
" - For embeddings, use `LLM.embed(...)` "
|
||||
'or `pooling_task="embed"`.\n'
|
||||
" - For classification logits, use `LLM.classify(...)` "
|
||||
'or `pooling_task="classify"`.\n'
|
||||
" - For similarity scores, use `LLM.score(...)`.\n"
|
||||
" - For rewards, use `LLM.reward(...)` "
|
||||
'or `pooling_task="token_classify"`\n'
|
||||
" - For token classification, "
|
||||
'use `pooling_task="token_classify"`\n'
|
||||
' - For multi-vector retrieval, use `pooling_task="token_embed"`'
|
||||
)
|
||||
|
||||
if pooling_task is None:
|
||||
raise ValueError(error_str)
|
||||
raise ValueError(
|
||||
"pooling_task required for `LLM.encode`\n"
|
||||
"Please use one of the more specific methods or set the "
|
||||
"pooling_task when using `LLM.encode`:\n"
|
||||
" - For embeddings, use `LLM.embed(...)` "
|
||||
'or `pooling_task="embed"`.\n'
|
||||
" - For classification logits, use `LLM.classify(...)` "
|
||||
'or `pooling_task="classify"`.\n'
|
||||
" - For similarity scores, use `LLM.score(...)`.\n"
|
||||
" - For rewards, use `LLM.reward(...)` "
|
||||
'or `pooling_task="token_classify"`\n'
|
||||
" - For token classification, "
|
||||
'use `pooling_task="token_classify"`\n'
|
||||
' - For multi-vector retrieval, use `pooling_task="token_embed"`'
|
||||
)
|
||||
|
||||
model_config = self.model_config
|
||||
runner_type = model_config.runner_type
|
||||
@@ -986,6 +1084,20 @@ class LLM:
|
||||
"pooling model."
|
||||
)
|
||||
|
||||
if truncate_prompt_tokens is not None:
|
||||
warnings.warn(
|
||||
"The `truncate_prompt_tokens` parameter in `LLM.encode()` "
|
||||
"is deprecated and will be removed in v0.16. "
|
||||
"Please pass it via `tokenization_kwargs` instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
tokenization_kwargs = merge_kwargs(
|
||||
tokenization_kwargs,
|
||||
dict(truncate_prompt_tokens=truncate_prompt_tokens),
|
||||
)
|
||||
|
||||
io_processor_prompt = False
|
||||
if isinstance(prompts, dict) and "data" in prompts:
|
||||
io_processor_prompt = True
|
||||
@@ -1017,19 +1129,16 @@ class LLM:
|
||||
pooling_params = self.io_processor.validate_or_generate_params(
|
||||
pooling_params
|
||||
)
|
||||
else:
|
||||
if pooling_params is None:
|
||||
# Use default pooling params.
|
||||
pooling_params = PoolingParams()
|
||||
|
||||
if pooling_params is None:
|
||||
# Use default pooling params.
|
||||
pooling_params = PoolingParams()
|
||||
|
||||
if pooling_task not in self.supported_tasks:
|
||||
raise ValueError(f"pooling_task must be one of {self.supported_tasks}.")
|
||||
|
||||
for param in as_iter(pooling_params):
|
||||
param.verify(pooling_task, model_config)
|
||||
# for backwards compatibility
|
||||
if truncate_prompt_tokens is not None:
|
||||
param.truncate_prompt_tokens = truncate_prompt_tokens
|
||||
|
||||
self._validate_and_add_requests(
|
||||
prompts=prompts,
|
||||
@@ -1094,6 +1203,7 @@ class LLM:
|
||||
it is used to create the progress bar.
|
||||
If `False`, no progress bar is created.
|
||||
lora_request: LoRA request to use for generation, if any.
|
||||
tokenization_kwargs: Overrides for `tokenizer.encode`.
|
||||
|
||||
Returns:
|
||||
A list of `EmbeddingRequestOutput` objects containing the
|
||||
@@ -1105,9 +1215,14 @@ class LLM:
|
||||
"Try converting the model using `--convert embed`."
|
||||
)
|
||||
|
||||
if truncate_prompt_tokens is not None:
|
||||
tokenization_kwargs = merge_kwargs(
|
||||
tokenization_kwargs,
|
||||
dict(truncate_prompt_tokens=truncate_prompt_tokens),
|
||||
)
|
||||
|
||||
items = self.encode(
|
||||
prompts,
|
||||
truncate_prompt_tokens=truncate_prompt_tokens,
|
||||
use_tqdm=use_tqdm,
|
||||
pooling_params=pooling_params,
|
||||
lora_request=lora_request,
|
||||
@@ -1121,8 +1236,8 @@ class LLM:
|
||||
self,
|
||||
prompts: PromptType | Sequence[PromptType],
|
||||
*,
|
||||
use_tqdm: bool | Callable[..., tqdm] = True,
|
||||
pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
|
||||
use_tqdm: bool | Callable[..., tqdm] = True,
|
||||
lora_request: list[LoRARequest] | LoRARequest | None = None,
|
||||
tokenization_kwargs: dict[str, Any] | None = None,
|
||||
) -> list[ClassificationRequestOutput]:
|
||||
@@ -1137,13 +1252,15 @@ class LLM:
|
||||
prompts: The prompts to the LLM. You may pass a sequence of prompts
|
||||
for batch inference. See [PromptType][vllm.inputs.PromptType]
|
||||
for more details about the format of each prompt.
|
||||
pooling_params: The pooling parameters for pooling. If None, we
|
||||
use the default pooling parameters.
|
||||
use_tqdm: If `True`, shows a tqdm progress bar.
|
||||
If a callable (e.g., `functools.partial(tqdm, leave=False)`),
|
||||
it is used to create the progress bar.
|
||||
If `False`, no progress bar is created.
|
||||
lora_request: LoRA request to use for generation, if any.
|
||||
pooling_params: The pooling parameters for pooling. If None, we
|
||||
use the default pooling parameters.
|
||||
tokenization_kwargs: Overrides for `tokenizer.encode`.
|
||||
|
||||
Returns:
|
||||
A list of `ClassificationRequestOutput` objects containing the
|
||||
embedding vectors in the same order as the input prompts.
|
||||
@@ -1170,9 +1287,9 @@ class LLM:
|
||||
prompts: PromptType | Sequence[PromptType],
|
||||
/,
|
||||
*,
|
||||
pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
|
||||
truncate_prompt_tokens: int | None = None,
|
||||
use_tqdm: bool | Callable[..., tqdm] = True,
|
||||
pooling_params: PoolingParams | Sequence[PoolingParams] | None = None,
|
||||
lora_request: list[LoRARequest] | LoRARequest | None = None,
|
||||
tokenization_kwargs: dict[str, Any] | None = None,
|
||||
) -> list[PoolingRequestOutput]:
|
||||
@@ -1183,13 +1300,15 @@ class LLM:
|
||||
prompts: The prompts to the LLM. You may pass a sequence of prompts
|
||||
for batch inference. See [PromptType][vllm.inputs.PromptType]
|
||||
for more details about the format of each prompt.
|
||||
pooling_params: The pooling parameters for pooling. If None, we
|
||||
use the default pooling parameters.
|
||||
use_tqdm: If `True`, shows a tqdm progress bar.
|
||||
If a callable (e.g., `functools.partial(tqdm, leave=False)`),
|
||||
it is used to create the progress bar.
|
||||
If `False`, no progress bar is created.
|
||||
lora_request: LoRA request to use for generation, if any.
|
||||
pooling_params: The pooling parameters for pooling. If None, we
|
||||
use the default pooling parameters.
|
||||
tokenization_kwargs: Overrides for `tokenizer.encode`.
|
||||
|
||||
Returns:
|
||||
A list of `PoolingRequestOutput` objects containing the
|
||||
pooled hidden states in the same order as the input prompts.
|
||||
@@ -1207,18 +1326,18 @@ class LLM:
|
||||
|
||||
def _embedding_score(
|
||||
self,
|
||||
tokenizer: TokenizerLike,
|
||||
text_1: list[str | TextPrompt | TokensPrompt],
|
||||
text_2: list[str | TextPrompt | TokensPrompt],
|
||||
truncate_prompt_tokens: int | None = None,
|
||||
use_tqdm: bool | Callable[..., tqdm] = True,
|
||||
pooling_params: PoolingParams | None = None,
|
||||
lora_request: list[LoRARequest] | LoRARequest | None = None,
|
||||
tokenization_kwargs: dict[str, Any] | None = None,
|
||||
text_1: list[SingletonPrompt],
|
||||
text_2: list[SingletonPrompt],
|
||||
*,
|
||||
use_tqdm: bool | Callable[..., tqdm],
|
||||
pooling_params: PoolingParams | None,
|
||||
lora_request: list[LoRARequest] | LoRARequest | None,
|
||||
tokenization_kwargs: dict[str, Any],
|
||||
) -> list[ScoringRequestOutput]:
|
||||
encoded_output: list[PoolingRequestOutput] = self.encode(
|
||||
tokenizer = self.get_tokenizer()
|
||||
|
||||
encoded_output = self.encode(
|
||||
text_1 + text_2,
|
||||
truncate_prompt_tokens=truncate_prompt_tokens,
|
||||
use_tqdm=use_tqdm,
|
||||
lora_request=lora_request,
|
||||
pooling_params=pooling_params,
|
||||
@@ -1226,14 +1345,16 @@ class LLM:
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
|
||||
encoded_output_1: list[PoolingRequestOutput] = encoded_output[0 : len(text_1)]
|
||||
encoded_output_2: list[PoolingRequestOutput] = encoded_output[len(text_1) :]
|
||||
encoded_output_1 = encoded_output[0 : len(text_1)]
|
||||
encoded_output_2 = encoded_output[len(text_1) :]
|
||||
|
||||
if len(encoded_output_1) == 1:
|
||||
encoded_output_1 = encoded_output_1 * len(encoded_output_2)
|
||||
|
||||
scores = _cosine_similarity(
|
||||
tokenizer=tokenizer, embed_1=encoded_output_1, embed_2=encoded_output_2
|
||||
tokenizer=tokenizer,
|
||||
embed_1=encoded_output_1,
|
||||
embed_2=encoded_output_2,
|
||||
)
|
||||
|
||||
items = self.engine_class.validate_outputs(scores, PoolingRequestOutput)
|
||||
@@ -1241,17 +1362,17 @@ class LLM:
|
||||
|
||||
def _cross_encoding_score(
|
||||
self,
|
||||
tokenizer: TokenizerLike,
|
||||
data_1: list[str] | list[ScoreContentPartParam],
|
||||
data_2: list[str] | list[ScoreContentPartParam],
|
||||
truncate_prompt_tokens: int | None = None,
|
||||
use_tqdm: bool | Callable[..., tqdm] = True,
|
||||
pooling_params: PoolingParams | None = None,
|
||||
lora_request: list[LoRARequest] | LoRARequest | None = None,
|
||||
tokenization_kwargs: dict[str, Any] | None = None,
|
||||
score_template: str | None = None,
|
||||
*,
|
||||
use_tqdm: bool | Callable[..., tqdm],
|
||||
pooling_params: PoolingParams | None,
|
||||
lora_request: list[LoRARequest] | LoRARequest | None,
|
||||
tokenization_kwargs: dict[str, Any],
|
||||
score_template: str | None,
|
||||
) -> list[ScoringRequestOutput]:
|
||||
model_config = self.model_config
|
||||
tokenizer = self.get_tokenizer()
|
||||
|
||||
if isinstance(tokenizer, MistralTokenizer):
|
||||
raise ValueError("Score API is not supported for Mistral tokenizer")
|
||||
@@ -1265,13 +1386,6 @@ class LLM:
|
||||
pooling_params.verify("score", model_config)
|
||||
pooling_params_list = list[PoolingParams]()
|
||||
|
||||
local_kwargs = tokenization_kwargs or {}
|
||||
tokenization_kwargs = local_kwargs.copy()
|
||||
|
||||
_validate_truncation_size(
|
||||
model_config.max_model_len, truncate_prompt_tokens, tokenization_kwargs
|
||||
)
|
||||
|
||||
prompts = list[PromptType]()
|
||||
|
||||
input_pairs = [(t1, t2) for t1, t2 in zip(data_1, data_2)]
|
||||
@@ -1314,10 +1428,10 @@ class LLM:
|
||||
data_2: SingletonPrompt | Sequence[SingletonPrompt] | ScoreMultiModalParam,
|
||||
/,
|
||||
*,
|
||||
truncate_prompt_tokens: int | None = None,
|
||||
use_tqdm: bool | Callable[..., tqdm] = True,
|
||||
pooling_params: PoolingParams | None = None,
|
||||
lora_request: list[LoRARequest] | LoRARequest | None = None,
|
||||
tokenization_kwargs: dict[str, Any] | None = None,
|
||||
chat_template: str | None = None,
|
||||
) -> list[ScoringRequestOutput]:
|
||||
"""Generate similarity scores for all pairs `<text,text_pair>` or
|
||||
@@ -1344,20 +1458,22 @@ class LLM:
|
||||
the LLM. Can be text or multi-modal data. See [PromptType]
|
||||
[vllm.inputs.PromptType] for more details about the format of
|
||||
each prompt.
|
||||
pooling_params: The pooling parameters for pooling. If None, we
|
||||
use the default pooling parameters.
|
||||
use_tqdm: If `True`, shows a tqdm progress bar.
|
||||
If a callable (e.g., `functools.partial(tqdm, leave=False)`),
|
||||
it is used to create the progress bar.
|
||||
If `False`, no progress bar is created.
|
||||
lora_request: LoRA request to use for generation, if any.
|
||||
pooling_params: The pooling parameters for pooling. If None, we
|
||||
use the default pooling parameters.
|
||||
chat_template: The chat template to use for the scoring. If None, we
|
||||
use the model's default chat template.
|
||||
tokenization_kwargs: Overrides for `tokenizer.encode`.
|
||||
Returns:
|
||||
A list of `ScoringRequestOutput` objects containing the
|
||||
generated scores in the same order as the input prompts.
|
||||
"""
|
||||
model_config = self.model_config
|
||||
|
||||
runner_type = model_config.runner_type
|
||||
if runner_type != "pooling":
|
||||
raise ValueError(
|
||||
@@ -1445,26 +1561,27 @@ class LLM:
|
||||
|
||||
_validate_score_input_lens(data_1, data_2) # type: ignore[arg-type]
|
||||
|
||||
tok_params = self._get_cmpl_tok_params(tokenization_kwargs)
|
||||
encode_kwargs = tok_params.get_encode_kwargs()
|
||||
|
||||
if model_config.is_cross_encoder:
|
||||
return self._cross_encoding_score(
|
||||
tokenizer,
|
||||
data_1, # type: ignore[arg-type]
|
||||
data_2, # type: ignore[arg-type]
|
||||
truncate_prompt_tokens,
|
||||
use_tqdm,
|
||||
pooling_params,
|
||||
lora_request,
|
||||
use_tqdm=use_tqdm,
|
||||
pooling_params=pooling_params,
|
||||
lora_request=lora_request,
|
||||
tokenization_kwargs=encode_kwargs,
|
||||
score_template=chat_template,
|
||||
)
|
||||
else:
|
||||
return self._embedding_score(
|
||||
tokenizer,
|
||||
data_1, # type: ignore[arg-type]
|
||||
data_2, # type: ignore[arg-type]
|
||||
truncate_prompt_tokens,
|
||||
use_tqdm,
|
||||
pooling_params,
|
||||
lora_request,
|
||||
use_tqdm=use_tqdm,
|
||||
pooling_params=pooling_params,
|
||||
lora_request=lora_request,
|
||||
tokenization_kwargs=encode_kwargs,
|
||||
)
|
||||
|
||||
def start_profile(self) -> None:
|
||||
@@ -1530,42 +1647,79 @@ class LLM:
|
||||
|
||||
def _validate_and_add_requests(
|
||||
self,
|
||||
prompts: PromptType | Sequence[PromptType] | DataPrompt,
|
||||
prompts: PromptType | Sequence[PromptType],
|
||||
params: SamplingParams
|
||||
| Sequence[SamplingParams]
|
||||
| PoolingParams
|
||||
| Sequence[PoolingParams],
|
||||
*,
|
||||
use_tqdm: bool | Callable[..., tqdm] = True,
|
||||
lora_request: Sequence[LoRARequest] | LoRARequest | None,
|
||||
priority: list[int] | None = None,
|
||||
lora_request: Sequence[LoRARequest | None] | LoRARequest | None,
|
||||
tokenization_kwargs: dict[str, Any] | None = None,
|
||||
priority: list[int] | None = None,
|
||||
) -> None:
|
||||
if isinstance(prompts, (str, dict)):
|
||||
# Convert a single prompt to a list.
|
||||
prompts = [prompts] # type: ignore[list-item]
|
||||
in_prompts = self._normalize_prompts(prompts)
|
||||
num_requests = len(in_prompts)
|
||||
|
||||
num_requests = len(prompts)
|
||||
if isinstance(params, Sequence) and len(params) != num_requests:
|
||||
raise ValueError("The lengths of prompts and params must be the same.")
|
||||
if isinstance(lora_request, Sequence) and len(lora_request) != num_requests:
|
||||
raise ValueError(
|
||||
"The lengths of prompts and lora_request must be the same."
|
||||
)
|
||||
if priority is not None and len(priority) != num_requests:
|
||||
raise ValueError(
|
||||
"The lengths of prompts "
|
||||
f"({num_requests}) and priority ({len(priority)}) "
|
||||
"must be the same."
|
||||
if isinstance(params, Sequence):
|
||||
if len(params) != num_requests:
|
||||
raise ValueError(
|
||||
f"The lengths of prompts ({num_requests}) "
|
||||
f"and params ({len(params)}) must be the same."
|
||||
)
|
||||
|
||||
engine_params = params
|
||||
else:
|
||||
engine_params = [params] * num_requests
|
||||
|
||||
if isinstance(lora_request, Sequence):
|
||||
if len(lora_request) != num_requests:
|
||||
raise ValueError(
|
||||
f"The lengths of prompts ({num_requests}) "
|
||||
f"and lora_request ({len(lora_request)}) must be the same."
|
||||
)
|
||||
|
||||
engine_lora_requests: Sequence[LoRARequest | None] = lora_request
|
||||
else:
|
||||
engine_lora_requests = [lora_request] * num_requests
|
||||
|
||||
if priority is not None:
|
||||
if len(priority) != num_requests:
|
||||
raise ValueError(
|
||||
f"The lengths of prompts ({num_requests}) "
|
||||
f"and priority ({len(priority)}) must be the same."
|
||||
)
|
||||
else:
|
||||
priority = [0] * num_requests
|
||||
|
||||
if any(param.truncate_prompt_tokens is not None for param in engine_params):
|
||||
# TODO: Remove this after deprecating `param.truncate_prompt_tokens`
|
||||
# Then, move the code from the `else` block to the top and let
|
||||
# `self._preprocess_completion` handle prompt normalization
|
||||
engine_prompts = [
|
||||
engine_prompt
|
||||
for in_prompt, param in zip(in_prompts, engine_params)
|
||||
for engine_prompt in self._preprocess_completion(
|
||||
[in_prompt],
|
||||
tokenization_kwargs=merge_kwargs(
|
||||
tokenization_kwargs,
|
||||
dict(truncate_prompt_tokens=param.truncate_prompt_tokens),
|
||||
),
|
||||
)
|
||||
]
|
||||
else:
|
||||
engine_prompts = self._preprocess_completion(
|
||||
in_prompts,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
|
||||
for sp in params if isinstance(params, Sequence) else (params,):
|
||||
for sp in engine_params:
|
||||
if isinstance(sp, SamplingParams):
|
||||
# We only care about the final output
|
||||
sp.output_kind = RequestOutputKind.FINAL_ONLY
|
||||
|
||||
# Add requests to the engine.
|
||||
it = prompts
|
||||
it = engine_prompts
|
||||
if use_tqdm:
|
||||
tqdm_func = use_tqdm if callable(use_tqdm) else tqdm
|
||||
it = tqdm_func(it, desc="Adding requests")
|
||||
@@ -1576,12 +1730,10 @@ class LLM:
|
||||
for i, prompt in enumerate(it):
|
||||
request_id = self._add_request(
|
||||
prompt,
|
||||
params[i] if isinstance(params, Sequence) else params,
|
||||
lora_request=lora_request[i]
|
||||
if isinstance(lora_request, Sequence)
|
||||
else lora_request,
|
||||
priority=priority[i] if priority else 0,
|
||||
engine_params[i],
|
||||
lora_request=engine_lora_requests[i],
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
priority=priority[i],
|
||||
)
|
||||
added_request_ids.append(request_id)
|
||||
except Exception as e:
|
||||
@@ -1589,54 +1741,42 @@ class LLM:
|
||||
self.llm_engine.abort_request(added_request_ids, internal=True)
|
||||
raise e
|
||||
|
||||
def _process_inputs(
|
||||
self,
|
||||
request_id: str,
|
||||
engine_prompt: PromptType,
|
||||
params: SamplingParams | PoolingParams,
|
||||
*,
|
||||
lora_request: LoRARequest | None,
|
||||
priority: int,
|
||||
tokenization_kwargs: dict[str, Any] | None = None,
|
||||
) -> tuple[EngineCoreRequest, dict[str, Any]]:
|
||||
"""Use the Processor to process inputs for LLMEngine."""
|
||||
|
||||
local_kwargs = tokenization_kwargs or {}
|
||||
tokenization_kwargs = local_kwargs.copy()
|
||||
_validate_truncation_size(
|
||||
self.model_config.max_model_len,
|
||||
params.truncate_prompt_tokens,
|
||||
tokenization_kwargs,
|
||||
)
|
||||
|
||||
engine_request = self.input_processor.process_inputs(
|
||||
request_id,
|
||||
engine_prompt,
|
||||
params,
|
||||
lora_request=lora_request,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
priority=priority,
|
||||
)
|
||||
return engine_request, tokenization_kwargs
|
||||
|
||||
def _add_request(
|
||||
self,
|
||||
prompt: PromptType,
|
||||
params: SamplingParams | PoolingParams,
|
||||
lora_request: LoRARequest | None = None,
|
||||
priority: int = 0,
|
||||
tokenization_kwargs: dict[str, Any] | None = None,
|
||||
priority: int = 0,
|
||||
) -> str:
|
||||
prompt_text, _, _ = get_prompt_components(prompt)
|
||||
request_id = str(next(self.request_counter))
|
||||
|
||||
engine_request, tokenization_kwargs = self._process_inputs(
|
||||
if params.truncate_prompt_tokens is not None:
|
||||
params_type = type(params).__name__
|
||||
warnings.warn(
|
||||
f"The `truncate_prompt_tokens` parameter in `{params_type}` "
|
||||
"is deprecated and will be removed in v0.16. "
|
||||
"Please pass it via `tokenization_kwargs` instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
tokenization_kwargs = merge_kwargs(
|
||||
tokenization_kwargs,
|
||||
dict(truncate_prompt_tokens=params.truncate_prompt_tokens),
|
||||
)
|
||||
|
||||
tok_params = self._get_cmpl_tok_params(tokenization_kwargs)
|
||||
|
||||
tokenization_kwargs = tok_params.get_encode_kwargs()
|
||||
engine_request = self.input_processor.process_inputs(
|
||||
request_id,
|
||||
prompt,
|
||||
params,
|
||||
lora_request=lora_request,
|
||||
priority=priority,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
priority=priority,
|
||||
)
|
||||
|
||||
self.llm_engine.add_request(
|
||||
|
||||
Reference in New Issue
Block a user