[Chore] Remove redundant RequestPrompt (#30612)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
@@ -12,9 +12,7 @@ import torch
 from pydantic import Field
 
 from vllm.config import ModelConfig
-from vllm.inputs.data import EmbedsPrompt as EngineEmbedsPrompt
-from vllm.inputs.data import TextPrompt as EngineTextPrompt
-from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
+from vllm.inputs.data import EmbedsPrompt, TextPrompt, TokensPrompt
 from vllm.inputs.parse import get_prompt_components, parse_raw_prompts
 from vllm.tokenizers import TokenizerLike
 from vllm.utils.async_utils import AsyncMicrobatchTokenizer
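Every hunk below is the same mechanical rename: the `Engine*` aliases were identical to the underlying `vllm.inputs.data` types, so the consolidated import drops the indirection. For orientation, these prompt types are constructed dict-style throughout the file; a minimal sketch of the pattern, with illustrative values taken from the construction sites later in this diff:

```python
import torch

from vllm.inputs.data import EmbedsPrompt, TokensPrompt

# TypedDict-style construction, mirroring the call sites in this diff:
tokens_prompt = TokensPrompt(prompt_token_ids=[1, 2, 3])
tokens_prompt["cache_salt"] = "salt-abc"  # optional key, attached after the fact

embeds_prompt = EmbedsPrompt(prompt_embeds=torch.zeros(4, 8))
```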
@@ -97,7 +95,7 @@ class BaseRenderer(ABC):
         *,
         prompt_or_prompts: str | list[str] | list[int] | list[list[int]],
         config: RenderConfig,
-    ) -> list[EngineTokensPrompt]:
+    ) -> list[TokensPrompt]:
         """
         Convert text or token inputs into engine-ready TokensPrompt objects.
 
@@ -115,7 +113,7 @@ class BaseRenderer(ABC):
         (e.g., tokenization and length handling).
 
         Returns:
-            list[EngineTokensPrompt]: Engine-ready token prompts.
+            list[TokensPrompt]: Engine-ready token prompts.
 
         Raises:
             ValueError: If input formats are invalid or length limits exceeded.
@@ -129,7 +127,7 @@ class BaseRenderer(ABC):
         prompt_or_prompts: str | list[str] | list[int] | list[list[int]] | None = None,
         prompt_embeds: bytes | list[bytes] | None = None,
         config: RenderConfig,
-    ) -> list[EngineTokensPrompt | EngineEmbedsPrompt]:
+    ) -> list[TokensPrompt | EmbedsPrompt]:
         """
         Convert text/token and/or base64-encoded embeddings inputs into
         engine-ready prompt objects using a unified RenderConfig.
@@ -146,7 +144,7 @@ class BaseRenderer(ABC):
         (e.g., tokenization and length handling).
 
         Returns:
-            list[Union[EngineTokensPrompt, EngineEmbedsPrompt]]:
+            list[Union[TokensPrompt, EmbedsPrompt]]:
                 Engine-ready prompt objects.
 
         Raises:
@@ -161,14 +159,14 @@ class BaseRenderer(ABC):
         prompt_embeds: bytes | list[bytes],
         truncate_prompt_tokens: Annotated[int, Field(ge=0)] | None = None,
         cache_salt: str | None = None,
-    ) -> list[EngineEmbedsPrompt]:
+    ) -> list[EmbedsPrompt]:
         """Load and validate base64-encoded embeddings into prompt objects."""
         if not self.model_config.enable_prompt_embeds:
             raise ValueError(
                 "You must set `--enable-prompt-embeds` to input `prompt_embeds`."
             )
 
-        def _load_and_validate_embed(embed: bytes) -> EngineEmbedsPrompt:
+        def _load_and_validate_embed(embed: bytes) -> EmbedsPrompt:
             tensor = torch.load(
                 io.BytesIO(pybase64.b64decode(embed, validate=True)),
                 weights_only=True,
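The decode path above fixes the wire format for `prompt_embeds`: a base64 string wrapping a `torch.save` payload. A sketch of the matching client-side encoder (the function name is ours, not part of the diff):

```python
import io

import pybase64
import torch

def encode_prompt_embeds(tensor: torch.Tensor) -> bytes:
    # Inverse of torch.load(io.BytesIO(pybase64.b64decode(...))) above;
    # plain-tensor torch.save payloads survive weights_only=True loading.
    buf = io.BytesIO()
    torch.save(tensor, buf)
    return pybase64.b64encode(buf.getvalue())

# Presumably a [num_tokens, hidden_size] tensor, per the
# `assert tensor.dim() == 2` in the next hunk.
payload = encode_prompt_embeds(torch.randn(16, 4096))
```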
@@ -185,7 +183,7 @@ class BaseRenderer(ABC):
             assert tensor.dim() == 2
             if truncate_prompt_tokens is not None:
                 tensor = tensor[-truncate_prompt_tokens:]
-            embeds_prompt = EngineEmbedsPrompt(prompt_embeds=tensor)
+            embeds_prompt = EmbedsPrompt(prompt_embeds=tensor)
             if cache_salt is not None:
                 embeds_prompt["cache_salt"] = cache_salt
             return embeds_prompt
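Note the truncation semantics here: the negative-start slice keeps the *last* `truncate_prompt_tokens` rows of the embedding, i.e. the tail of the prompt. A self-contained illustration:

```python
import torch

t = torch.arange(12).reshape(6, 2)  # six "token" rows, hidden size 2
kept = t[-2:]                       # truncate_prompt_tokens == 2 keeps the tail
assert kept.tolist() == [[8, 9], [10, 11]]
```

A value of 0 never reaches this slice (where `[-0:]` would keep everything): the render path short-circuits it to an empty result, as the `@@ -249` hunk below shows.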
@@ -213,7 +211,7 @@ class CompletionRenderer(BaseRenderer):
         *,
         prompt_or_prompts: str | list[str] | list[int] | list[list[int]],
         config: RenderConfig,
-    ) -> list[EngineTokensPrompt]:
+    ) -> list[TokensPrompt]:
         """Implementation of prompt rendering for completion-style requests.
 
         Uses async tokenizer pooling for improved performance. See base class
@@ -240,7 +238,7 @@ class CompletionRenderer(BaseRenderer):
         prompt_or_prompts: str | list[str] | list[int] | list[list[int]] | None = None,
         prompt_embeds: bytes | list[bytes] | None = None,
         config: RenderConfig,
-    ) -> list[EngineTokensPrompt | EngineEmbedsPrompt]:
+    ) -> list[TokensPrompt | EmbedsPrompt]:
         """
         Render text/token prompts and/or precomputed embedding prompts. At
         least one of `prompt_or_prompts` or `prompt_embeds` must be provided.
@@ -249,7 +247,7 @@ class CompletionRenderer(BaseRenderer):
         if truncate_prompt_tokens == 0:
             return []
 
-        rendered: list[EngineTokensPrompt | EngineEmbedsPrompt] = []
+        rendered: list[TokensPrompt | EmbedsPrompt] = []
 
         if prompt_embeds is not None:
             rendered.extend(
@@ -281,10 +279,10 @@ class CompletionRenderer(BaseRenderer):
 
     async def _create_prompt(
         self,
-        prompt_input: EngineTextPrompt | EngineTokensPrompt,
+        prompt_input: TextPrompt | TokensPrompt,
         config: RenderConfig,
         truncate_prompt_tokens: int | None,
-    ) -> EngineTokensPrompt:
+    ) -> TokensPrompt:
         prompt, prompt_token_ids, _ = get_prompt_components(prompt_input)
 
         if prompt_token_ids is not None:
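`get_prompt_components` splits a prompt input into its text and token-ID parts so `_create_prompt` can branch on whichever is present (the branch bodies fall outside this hunk). For orientation, the two accepted input shapes are dict-like; key names follow `vllm.inputs.data`, and the full set of optional keys is not shown in this diff:

```python
# Illustrative _create_prompt inputs. A TextPrompt carries raw text still to
# be tokenized; a TokensPrompt already carries IDs.
text_input = {"prompt": "Hello, world!"}           # TextPrompt-shaped
tokens_input = {"prompt_token_ids": [1, 2, 3, 4]}  # TokensPrompt-shaped
```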
@@ -317,7 +315,7 @@ class CompletionRenderer(BaseRenderer):
         truncate_prompt_tokens: int | None,
         add_special_tokens: bool,
         cache_salt: str | None,
-    ) -> EngineTokensPrompt:
+    ) -> TokensPrompt:
         """Tokenize text input asynchronously."""
         async_tokenizer = self._get_async_tokenizer()
 
@@ -350,7 +348,7 @@ class CompletionRenderer(BaseRenderer):
         truncate_prompt_tokens: int | None,
         cache_salt: str | None,
         needs_detokenization: bool | None = False,
-    ) -> EngineTokensPrompt:
+    ) -> TokensPrompt:
         """Optionally detokenize token IDs and build a tokens prompt."""
         token_ids = self._maybe_apply_truncation(token_ids, truncate_prompt_tokens)
 
@@ -392,8 +390,8 @@ class CompletionRenderer(BaseRenderer):
         max_length: int | None = None,
         cache_salt: str | None = None,
         prompt: str | None = None,
-    ) -> EngineTokensPrompt:
-        """Create validated EngineTokensPrompt."""
+    ) -> TokensPrompt:
+        """Create validated TokensPrompt."""
         if max_length is not None and len(token_ids) > max_length:
             raise ValueError(
                 f"This model's maximum context length is {max_length} tokens. "
@@ -401,7 +399,7 @@ class CompletionRenderer(BaseRenderer):
                 "Please reduce the length of the input messages."
             )
 
-        tokens_prompt = EngineTokensPrompt(prompt_token_ids=token_ids)
+        tokens_prompt = TokensPrompt(prompt_token_ids=token_ids)
         if cache_salt is not None:
             tokens_prompt["cache_salt"] = cache_salt
         if prompt is not None:
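Taken together, the last two hunks cover the whole of `_build_tokens_prompt`: validate length, build the prompt, attach optional keys. A standalone sketch using a TypedDict stand-in; the body of the `if prompt is not None:` branch falls outside the hunk, so attaching a `prompt` key there is our assumption:

```python
from typing import TypedDict

class TokensPromptStandIn(TypedDict, total=False):
    prompt_token_ids: list[int]
    cache_salt: str
    prompt: str

def build_tokens_prompt(
    token_ids: list[int],
    max_length: int | None = None,
    cache_salt: str | None = None,
    prompt: str | None = None,
) -> TokensPromptStandIn:
    # Over-length input raises rather than being silently truncated.
    if max_length is not None and len(token_ids) > max_length:
        raise ValueError(
            f"This model's maximum context length is {max_length} tokens. "
            "Please reduce the length of the input messages."
        )
    tokens_prompt = TokensPromptStandIn(prompt_token_ids=token_ids)
    if cache_salt is not None:
        tokens_prompt["cache_salt"] = cache_salt
    if prompt is not None:
        tokens_prompt["prompt"] = prompt  # assumption: branch body not in hunk
    return tokens_prompt
```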