[Core][VLM] Add precise multi-modal placeholder tracking (#8346)
Signed-off-by: Peter Salas <peter@fixie.ai>
This commit is contained in:
@@ -15,13 +15,13 @@ import torch
|
||||
|
||||
from vllm.inputs.parse import is_encoder_decoder_inputs
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict
|
||||
from vllm.pooling_params import PoolingParams
|
||||
from vllm.prompt_adapter.request import PromptAdapterRequest
|
||||
from vllm.sampling_params import RequestOutputKind, SamplingParams
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.inputs import SingletonInputs
|
||||
from vllm.multimodal.base import MultiModalDataDict
|
||||
|
||||
VLLM_TOKEN_ID_ARRAY_TYPE = "l"
|
||||
|
||||
@@ -485,7 +485,7 @@ class Sequence:
|
||||
return cast(List[int], self.inputs.get(prompt_token_ids_key))
|
||||
|
||||
@property
|
||||
def multi_modal_data(self) -> "MultiModalDataDict":
|
||||
def multi_modal_data(self) -> MultiModalDataDict:
|
||||
inputs = self.inputs
|
||||
|
||||
if (inputs.get("multi_modal_data")
|
||||
@@ -495,11 +495,15 @@ class Sequence:
|
||||
)
|
||||
|
||||
return cast(
|
||||
"MultiModalDataDict",
|
||||
MultiModalDataDict,
|
||||
(inputs.get("multi_modal_data")
|
||||
or inputs.get("encoder_multi_modal_data") or {}),
|
||||
)
|
||||
|
||||
@property
|
||||
def multi_modal_placeholders(self) -> MultiModalPlaceholderDict:
|
||||
return self.inputs.get("multi_modal_placeholders") or {}
|
||||
|
||||
@property
|
||||
def mm_processor_kwargs(self) -> Dict[str, Any]:
|
||||
return self.inputs.get("mm_processor_kwargs") or {}
|
||||
@@ -728,9 +732,13 @@ class SequenceGroup:
|
||||
if self.encoder_seq is not None else None)
|
||||
|
||||
@property
|
||||
def multi_modal_data(self) -> "MultiModalDataDict":
|
||||
def multi_modal_data(self) -> MultiModalDataDict:
|
||||
return self.first_seq.multi_modal_data
|
||||
|
||||
@property
|
||||
def multi_modal_placeholders(self) -> MultiModalPlaceholderDict:
|
||||
return self.first_seq.multi_modal_placeholders
|
||||
|
||||
@property
|
||||
def mm_processor_kwargs(self) -> Dict[str, Any]:
|
||||
return self.first_seq.mm_processor_kwargs
|
||||
@@ -946,6 +954,7 @@ class SequenceGroupMetadata(
|
||||
# "MultiModalDataDict" types. We have to use Any because msgspec
|
||||
# does not allow a union of 2 different dict types.
|
||||
multi_modal_data: Optional[Any] = None
|
||||
multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None
|
||||
mm_processor_kwargs: Optional[Dict[str, Any]] = None
|
||||
encoder_seq_data: Optional[SequenceData] = None
|
||||
cross_block_table: Optional[List[int]] = None
|
||||
|
||||
Reference in New Issue
Block a user