[CORE] Prompt Embeddings Support for v1 Engine (#24278)

Signed-off-by: Andrew Sansom <andrew@protopia.ai>
Signed-off-by: Andrew Sansom <qthequartermasterman@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
This commit is contained in:
Andrew Sansom
2025-09-18 19:03:09 -05:00
committed by GitHub
parent 9fac6aa30b
commit 9a4600e4dc
20 changed files with 305 additions and 76 deletions

View File

@@ -7,9 +7,12 @@ from collections.abc import Mapping
from functools import partial
from typing import TYPE_CHECKING, Any, Callable, Optional, Union
import torch
from vllm.multimodal.inputs import MultiModalFeatureSpec
from vllm.pooling_params import PoolingParams
from vllm.sampling_params import SamplingParams
from vllm.utils import length_from_prompt_token_ids_or_embeds
from vllm.v1.engine import (EngineCoreEvent, EngineCoreEventType,
EngineCoreRequest, FinishReason)
from vllm.v1.structured_output.request import StructuredOutputRequest
@@ -25,12 +28,13 @@ class Request:
def __init__(
self,
request_id: str,
prompt_token_ids: list[int],
prompt_token_ids: Optional[list[int]],
sampling_params: Optional[SamplingParams],
pooling_params: Optional[PoolingParams],
eos_token_id: Optional[int],
client_index: int = 0,
arrival_time: Optional[float] = None,
prompt_embeds: Optional[torch.Tensor] = None,
mm_features: Optional[list[MultiModalFeatureSpec]] = None,
lora_request: Optional["LoRARequest"] = None,
structured_output_request: Optional["StructuredOutputRequest"] = None,
@@ -79,9 +83,13 @@ class Request:
"sampling_params and pooling_params can't both be unset")
self.prompt_token_ids = prompt_token_ids
self.num_prompt_tokens = len(self.prompt_token_ids)
self.prompt_embeds = prompt_embeds
self.num_prompt_tokens = length_from_prompt_token_ids_or_embeds(
prompt_token_ids, prompt_embeds)
self._output_token_ids: list[int] = []
self._all_token_ids: list[int] = self.prompt_token_ids.copy()
self._all_token_ids: list[int] = self.prompt_token_ids.copy(
) if self.prompt_token_ids is not None else [0
] * self.num_prompt_tokens
self.num_output_placeholders = 0 # Used in async scheduling.
self.spec_token_ids: list[int] = []
self.num_computed_tokens = 0
@@ -123,6 +131,7 @@ class Request:
request_id=request.request_id,
client_index=request.client_index,
prompt_token_ids=request.prompt_token_ids,
prompt_embeds=request.prompt_embeds,
mm_features=request.mm_features,
sampling_params=request.sampling_params,
pooling_params=request.pooling_params,