[Core][VLM] Add precise multi-modal placeholder tracking (#8346)

Signed-off-by: Peter Salas <peter@fixie.ai>
This commit is contained in:
Peter Salas
2024-11-01 16:21:10 -07:00
committed by GitHub
parent d151fde834
commit 6c0b7f548d
53 changed files with 913 additions and 281 deletions

View File

@@ -15,13 +15,13 @@ import torch
from vllm.inputs.parse import is_encoder_decoder_inputs
from vllm.lora.request import LoRARequest
from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict
from vllm.pooling_params import PoolingParams
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sampling_params import RequestOutputKind, SamplingParams
if TYPE_CHECKING:
from vllm.inputs import SingletonInputs
from vllm.multimodal.base import MultiModalDataDict
VLLM_TOKEN_ID_ARRAY_TYPE = "l"
@@ -485,7 +485,7 @@ class Sequence:
return cast(List[int], self.inputs.get(prompt_token_ids_key))
@property
def multi_modal_data(self) -> "MultiModalDataDict":
def multi_modal_data(self) -> MultiModalDataDict:
inputs = self.inputs
if (inputs.get("multi_modal_data")
@@ -495,11 +495,15 @@ class Sequence:
)
return cast(
"MultiModalDataDict",
MultiModalDataDict,
(inputs.get("multi_modal_data")
or inputs.get("encoder_multi_modal_data") or {}),
)
@property
def multi_modal_placeholders(self) -> MultiModalPlaceholderDict:
    """Return the ``"multi_modal_placeholders"`` entry of ``self.inputs``.

    A new empty dict is returned when the entry is missing or falsy.
    """
    placeholders = self.inputs.get("multi_modal_placeholders")
    if not placeholders:
        return {}
    return placeholders
@property
def mm_processor_kwargs(self) -> Dict[str, Any]:
    """Return the ``"mm_processor_kwargs"`` entry of ``self.inputs``.

    A new empty dict is returned when the entry is missing or falsy.
    """
    processor_kwargs = self.inputs.get("mm_processor_kwargs")
    if not processor_kwargs:
        return {}
    return processor_kwargs
@@ -728,9 +732,13 @@ class SequenceGroup:
if self.encoder_seq is not None else None)
@property
def multi_modal_data(self) -> "MultiModalDataDict":
def multi_modal_data(self) -> MultiModalDataDict:
return self.first_seq.multi_modal_data
@property
def multi_modal_placeholders(self) -> MultiModalPlaceholderDict:
    """Return the ``multi_modal_placeholders`` of ``self.first_seq``."""
    leading_seq = self.first_seq
    return leading_seq.multi_modal_placeholders
@property
def mm_processor_kwargs(self) -> Dict[str, Any]:
    """Return the ``mm_processor_kwargs`` of ``self.first_seq``."""
    leading_seq = self.first_seq
    return leading_seq.mm_processor_kwargs
@@ -946,6 +954,7 @@ class SequenceGroupMetadata(
# "MultiModalDataDict" types. We have to use Any because msgspec
# doesn't allow a union of 2 different dicts.
multi_modal_data: Optional[Any] = None
multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None
mm_processor_kwargs: Optional[Dict[str, Any]] = None
encoder_seq_data: Optional[SequenceData] = None
cross_block_table: Optional[List[int]] = None