[Refactor] Decouple TimingContext from InputProcessingContext (#35083)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -41,15 +41,16 @@ from vllm.multimodal.parse import (
|
||||
ImageProcessorItems,
|
||||
ImageSize,
|
||||
MultiModalDataItems,
|
||||
MultiModalUUIDItems,
|
||||
)
|
||||
from vllm.multimodal.processing import (
|
||||
BaseDummyInputsBuilder,
|
||||
BaseMultiModalProcessor,
|
||||
BaseProcessingInfo,
|
||||
ProcessorInputs,
|
||||
PromptIndexTargets,
|
||||
PromptReplacement,
|
||||
PromptUpdate,
|
||||
TimingContext,
|
||||
)
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
@@ -204,23 +205,20 @@ class CLIPMultiModalProcessor(BaseMultiModalProcessor[CLIPProcessingInfo]):
|
||||
|
||||
def apply(
|
||||
self,
|
||||
prompt: str | list[int],
|
||||
mm_items: MultiModalDataItems,
|
||||
mm_uuid_items: MultiModalUUIDItems | None = None,
|
||||
hf_processor_mm_kwargs: Mapping[str, object] | None = None,
|
||||
tokenization_kwargs: Mapping[str, object] | None = None,
|
||||
inputs: ProcessorInputs,
|
||||
timing_ctx: TimingContext,
|
||||
) -> MultiModalInputs:
|
||||
if mm_items:
|
||||
if isinstance(prompt, str):
|
||||
if len(prompt) > 0:
|
||||
if inputs.mm_data_items:
|
||||
if isinstance(inputs.prompt, str):
|
||||
if len(inputs.prompt) > 0:
|
||||
raise ValueError(
|
||||
"CLIP accepts text-only or image-only inputs, not both! "
|
||||
"You must pass an image with an empty text prompt."
|
||||
)
|
||||
else:
|
||||
special_tokens = self.info.get_tokenizer().all_special_ids
|
||||
if all(tok in special_tokens for tok in prompt):
|
||||
prompt = []
|
||||
if all(tok in special_tokens for tok in inputs.prompt):
|
||||
inputs.prompt = []
|
||||
else:
|
||||
raise ValueError(
|
||||
"CLIP accepts text-only or image-only inputs, not both! "
|
||||
@@ -229,18 +227,12 @@ class CLIPMultiModalProcessor(BaseMultiModalProcessor[CLIPProcessingInfo]):
|
||||
|
||||
# For multi-modal data, the prompt after processing should
|
||||
# only contain the dummy image tokens
|
||||
tokenization_kwargs = {
|
||||
**(tokenization_kwargs or {}),
|
||||
inputs.tokenization_kwargs = {
|
||||
**inputs.tokenization_kwargs,
|
||||
"add_special_tokens": False,
|
||||
}
|
||||
|
||||
return super().apply(
|
||||
prompt=prompt,
|
||||
mm_items=mm_items,
|
||||
mm_uuid_items=mm_uuid_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
return super().apply(inputs, timing_ctx)
|
||||
|
||||
def _hf_processor_applies_updates(
|
||||
self,
|
||||
|
||||
@@ -30,15 +30,16 @@ from vllm.multimodal.parse import (
|
||||
ImageProcessorItems,
|
||||
ImageSize,
|
||||
MultiModalDataItems,
|
||||
MultiModalUUIDItems,
|
||||
)
|
||||
from vllm.multimodal.processing import BaseDummyInputsBuilder
|
||||
from vllm.multimodal.processing.processor import (
|
||||
BaseMultiModalProcessor,
|
||||
BaseProcessingInfo,
|
||||
MultiModalProcessingInfo,
|
||||
ProcessorInputs,
|
||||
PromptReplacement,
|
||||
PromptUpdate,
|
||||
TimingContext,
|
||||
)
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.tokenizers import cached_tokenizer_from_config
|
||||
@@ -310,32 +311,17 @@ class DeepseekVL2MultiModalProcessor(
|
||||
|
||||
def _cached_apply_hf_processor(
|
||||
self,
|
||||
prompt: str | list[int],
|
||||
mm_data_items: MultiModalDataItems,
|
||||
mm_uuid_items: MultiModalUUIDItems | None,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Mapping[str, object],
|
||||
inputs: ProcessorInputs,
|
||||
timing_ctx: TimingContext,
|
||||
) -> tuple[list[int], MultiModalProcessingInfo, bool]:
|
||||
# The processor logic is different for len(images) <= 2 vs > 2
|
||||
# Since the processing cache assumes that the processor output is
|
||||
# invariant of how many images are passed per prompt, we only
|
||||
# perform caching for the most common case
|
||||
if mm_data_items.get_count("image", strict=False) > 2:
|
||||
return self._apply_hf_processor(
|
||||
prompt=prompt,
|
||||
mm_data_items=mm_data_items,
|
||||
mm_uuid_items=mm_uuid_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
if inputs.mm_data_items.get_count("image", strict=False) > 2:
|
||||
return self._apply_hf_processor(inputs, timing_ctx)
|
||||
|
||||
return super()._cached_apply_hf_processor(
|
||||
prompt=prompt,
|
||||
mm_data_items=mm_data_items,
|
||||
mm_uuid_items=mm_uuid_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
return super()._cached_apply_hf_processor(inputs, timing_ctx)
|
||||
|
||||
|
||||
@MULTIMODAL_REGISTRY.register_processor(
|
||||
|
||||
@@ -21,13 +21,14 @@ from vllm.multimodal.parse import (
|
||||
ImageEmbeddingItems,
|
||||
ImageProcessorItems,
|
||||
MultiModalDataItems,
|
||||
MultiModalUUIDItems,
|
||||
)
|
||||
from vllm.multimodal.processing.processor import (
|
||||
MultiModalProcessingInfo,
|
||||
ProcessorInputs,
|
||||
PromptReplacement,
|
||||
PromptUpdate,
|
||||
PromptUpdateDetails,
|
||||
TimingContext,
|
||||
)
|
||||
from vllm.tokenizers import TokenizerLike
|
||||
|
||||
@@ -490,32 +491,17 @@ class H2OVLMultiModalProcessor(BaseInternVLMultiModalProcessor[H2OVLProcessingIn
|
||||
|
||||
def _cached_apply_hf_processor(
|
||||
self,
|
||||
prompt: str | list[int],
|
||||
mm_data_items: MultiModalDataItems,
|
||||
mm_uuid_items: MultiModalUUIDItems | None,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Mapping[str, object],
|
||||
inputs: ProcessorInputs,
|
||||
timing_ctx: TimingContext,
|
||||
) -> tuple[list[int], MultiModalProcessingInfo, bool]:
|
||||
# The processor logic is different for len(images) <= 1 vs > 1
|
||||
# Since the processing cache assumes that the processor output is
|
||||
# invariant of how many images are passed per prompt, we only
|
||||
# perform caching for the most common case
|
||||
if mm_data_items.get_count("image", strict=False) > 1:
|
||||
return self._apply_hf_processor(
|
||||
prompt=prompt,
|
||||
mm_data_items=mm_data_items,
|
||||
mm_uuid_items=mm_uuid_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
if inputs.mm_data_items.get_count("image", strict=False) > 1:
|
||||
return self._apply_hf_processor(inputs, timing_ctx)
|
||||
|
||||
return super()._cached_apply_hf_processor(
|
||||
prompt=prompt,
|
||||
mm_data_items=mm_data_items,
|
||||
mm_uuid_items=mm_uuid_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
return super()._cached_apply_hf_processor(inputs, timing_ctx)
|
||||
|
||||
|
||||
@MULTIMODAL_REGISTRY.register_processor(
|
||||
|
||||
@@ -37,16 +37,17 @@ from vllm.multimodal.parse import (
|
||||
ImageProcessorItems,
|
||||
ImageSize,
|
||||
MultiModalDataItems,
|
||||
MultiModalUUIDItems,
|
||||
)
|
||||
from vllm.multimodal.processing import (
|
||||
BaseDummyInputsBuilder,
|
||||
BaseMultiModalProcessor,
|
||||
BaseProcessingInfo,
|
||||
InputProcessingContext,
|
||||
ProcessorInputs,
|
||||
PromptReplacement,
|
||||
PromptUpdate,
|
||||
PromptUpdateDetails,
|
||||
TimingContext,
|
||||
)
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
@@ -770,11 +771,8 @@ class MantisProcessingInfo(LlavaProcessingInfo):
|
||||
class MantisMultiModalProcessor(LlavaMultiModalProcessor):
|
||||
def apply(
|
||||
self,
|
||||
prompt: str | list[int],
|
||||
mm_items: MultiModalDataItems,
|
||||
mm_uuid_items: MultiModalUUIDItems | None = None,
|
||||
hf_processor_mm_kwargs: Mapping[str, object] | None = None,
|
||||
tokenization_kwargs: Mapping[str, object] | None = None,
|
||||
inputs: ProcessorInputs,
|
||||
timing_ctx: TimingContext,
|
||||
) -> MultiModalInputs:
|
||||
hf_config = self.info.get_hf_config()
|
||||
image_token_id = hf_config.image_token_index
|
||||
@@ -785,15 +783,9 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
|
||||
image_height=-1,
|
||||
)
|
||||
|
||||
result = super().apply(
|
||||
prompt,
|
||||
mm_items,
|
||||
mm_uuid_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
result = super().apply(inputs, timing_ctx)
|
||||
|
||||
mm_item_counts = mm_items.get_all_counts()
|
||||
mm_item_counts = inputs.mm_data_items.get_all_counts()
|
||||
mm_kwargs = result["mm_kwargs"]
|
||||
mm_hashes = result["mm_hashes"]
|
||||
|
||||
@@ -825,8 +817,8 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
|
||||
)
|
||||
|
||||
orig_repls = self._get_mm_prompt_updates(
|
||||
mm_items,
|
||||
hf_processor_mm_kwargs,
|
||||
inputs.mm_data_items,
|
||||
inputs.hf_processor_mm_kwargs,
|
||||
mm_kwargs,
|
||||
)
|
||||
mm_placeholders = self._find_mm_placeholders(prompt_ids, orig_repls)
|
||||
|
||||
@@ -21,16 +21,17 @@ from vllm.multimodal.parse import (
|
||||
ImageEmbeddingItems,
|
||||
ImageProcessorItems,
|
||||
MultiModalDataItems,
|
||||
MultiModalUUIDItems,
|
||||
)
|
||||
from vllm.multimodal.processing import (
|
||||
BaseDummyInputsBuilder,
|
||||
BaseMultiModalProcessor,
|
||||
BaseProcessingInfo,
|
||||
ProcessorInputs,
|
||||
PromptIndexTargets,
|
||||
PromptInsertion,
|
||||
PromptUpdate,
|
||||
PromptUpdateDetails,
|
||||
TimingContext,
|
||||
)
|
||||
from vllm.renderers import TokenizeParams
|
||||
from vllm.sequence import IntermediateTensors
|
||||
@@ -228,19 +229,10 @@ class PaliGemmaMultiModalProcessor(BaseMultiModalProcessor[PaliGemmaProcessingIn
|
||||
|
||||
def apply(
|
||||
self,
|
||||
prompt: str | list[int],
|
||||
mm_items: MultiModalDataItems,
|
||||
mm_uuid_items: MultiModalUUIDItems | None = None,
|
||||
hf_processor_mm_kwargs: Mapping[str, object] | None = None,
|
||||
tokenization_kwargs: Mapping[str, object] | None = None,
|
||||
inputs: ProcessorInputs,
|
||||
timing_ctx: TimingContext,
|
||||
) -> MultiModalInputs:
|
||||
mm_inputs = super().apply(
|
||||
prompt,
|
||||
mm_items,
|
||||
mm_uuid_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
mm_inputs = super().apply(inputs, timing_ctx)
|
||||
prompt_token_ids = mm_inputs["prompt_token_ids"]
|
||||
|
||||
tokenizer = self.info.get_tokenizer()
|
||||
|
||||
@@ -50,16 +50,17 @@ from vllm.multimodal.parse import (
|
||||
ImageProcessorItems,
|
||||
ImageSize,
|
||||
MultiModalDataItems,
|
||||
MultiModalUUIDItems,
|
||||
)
|
||||
from vllm.multimodal.processing import BaseDummyInputsBuilder, ProcessorInputs
|
||||
from vllm.multimodal.processing import BaseDummyInputsBuilder
|
||||
from vllm.multimodal.processing.processor import (
|
||||
BaseMultiModalProcessor,
|
||||
BaseProcessingInfo,
|
||||
MultiModalProcessingInfo,
|
||||
ProcessorInputs,
|
||||
PromptReplacement,
|
||||
PromptUpdate,
|
||||
PromptUpdateDetails,
|
||||
TimingContext,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sequence import IntermediateTensors
|
||||
@@ -277,7 +278,6 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):
|
||||
dummy_text = self.get_dummy_text(mm_counts)
|
||||
dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts, mm_options)
|
||||
dummy_images = dummy_mm_data.get("image", [])
|
||||
tokenization_kwargs = {"truncation": False}
|
||||
|
||||
request = ChatCompletionRequest(
|
||||
messages=[
|
||||
@@ -294,11 +294,7 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):
|
||||
|
||||
dummy_mm_items = self.info.parse_mm_data(dummy_mm_data)
|
||||
|
||||
return ProcessorInputs(
|
||||
prompt=dummy_tokens,
|
||||
mm_items=dummy_mm_items,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
return ProcessorInputs(prompt=dummy_tokens, mm_data_items=dummy_mm_items)
|
||||
|
||||
|
||||
class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]):
|
||||
@@ -344,19 +340,10 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo])
|
||||
|
||||
def _cached_apply_hf_processor(
|
||||
self,
|
||||
prompt: str | list[int],
|
||||
mm_data_items: MultiModalDataItems,
|
||||
mm_uuid_items: MultiModalUUIDItems | None,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Mapping[str, object],
|
||||
inputs: ProcessorInputs,
|
||||
timing_ctx: TimingContext,
|
||||
) -> tuple[list[int], MultiModalProcessingInfo, bool]:
|
||||
prompt_ids, mm_info, _ = super()._cached_apply_hf_processor(
|
||||
prompt=prompt,
|
||||
mm_data_items=mm_data_items,
|
||||
mm_uuid_items=mm_uuid_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
prompt_ids, mm_info, _ = super()._cached_apply_hf_processor(inputs, timing_ctx)
|
||||
|
||||
# NOTE: The tokens are already inserted by the chat template
|
||||
return prompt_ids, mm_info, True
|
||||
|
||||
@@ -47,15 +47,16 @@ from vllm.multimodal.parse import (
|
||||
ImageProcessorItems,
|
||||
ImageSize,
|
||||
MultiModalDataItems,
|
||||
MultiModalUUIDItems,
|
||||
)
|
||||
from vllm.multimodal.processing import (
|
||||
BaseDummyInputsBuilder,
|
||||
BaseMultiModalProcessor,
|
||||
BaseProcessingInfo,
|
||||
ProcessorInputs,
|
||||
PromptIndexTargets,
|
||||
PromptReplacement,
|
||||
PromptUpdate,
|
||||
TimingContext,
|
||||
)
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.utils.tensor_schema import TensorSchema, TensorShape
|
||||
@@ -190,23 +191,20 @@ class SiglipMultiModalProcessor(BaseMultiModalProcessor[SiglipProcessingInfo]):
|
||||
|
||||
def apply(
|
||||
self,
|
||||
prompt: str | list[int],
|
||||
mm_items: MultiModalDataItems,
|
||||
mm_uuid_items: MultiModalUUIDItems | None = None,
|
||||
hf_processor_mm_kwargs: Mapping[str, object] | None = None,
|
||||
tokenization_kwargs: Mapping[str, object] | None = None,
|
||||
inputs: ProcessorInputs,
|
||||
timing_ctx: TimingContext,
|
||||
) -> MultiModalInputs:
|
||||
if mm_items:
|
||||
if isinstance(prompt, str):
|
||||
if len(prompt) > 0:
|
||||
if inputs.mm_data_items:
|
||||
if isinstance(inputs.prompt, str):
|
||||
if len(inputs.prompt) > 0:
|
||||
raise ValueError(
|
||||
"SigLIP accepts text-only or image-only inputs, not both! "
|
||||
"You must pass an image with an empty text prompt."
|
||||
)
|
||||
else:
|
||||
special_tokens = self.info.get_tokenizer().all_special_ids
|
||||
if all(tok in special_tokens for tok in prompt):
|
||||
prompt = []
|
||||
if all(tok in special_tokens for tok in inputs.prompt):
|
||||
inputs.prompt = []
|
||||
else:
|
||||
raise ValueError(
|
||||
"SigLIP accepts text-only or image-only inputs, not both! "
|
||||
@@ -214,19 +212,13 @@ class SiglipMultiModalProcessor(BaseMultiModalProcessor[SiglipProcessingInfo]):
|
||||
)
|
||||
|
||||
# For multi-modal data, the prompt after processing should
|
||||
# only contain the image token
|
||||
tokenization_kwargs = {
|
||||
**(tokenization_kwargs or {}),
|
||||
# only contain the dummy image tokens
|
||||
inputs.tokenization_kwargs = {
|
||||
**inputs.tokenization_kwargs,
|
||||
"add_special_tokens": False,
|
||||
}
|
||||
|
||||
return super().apply(
|
||||
prompt=prompt,
|
||||
mm_items=mm_items,
|
||||
mm_uuid_items=mm_uuid_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
return super().apply(inputs, timing_ctx)
|
||||
|
||||
def _hf_processor_applies_updates(
|
||||
self,
|
||||
|
||||
@@ -54,13 +54,14 @@ from vllm.multimodal.parse import (
|
||||
ModalityDataItems,
|
||||
MultiModalDataItems,
|
||||
MultiModalDataParser,
|
||||
MultiModalUUIDItems,
|
||||
)
|
||||
from vllm.multimodal.processing import (
|
||||
BaseDummyInputsBuilder,
|
||||
BaseMultiModalProcessor,
|
||||
BaseProcessingInfo,
|
||||
ProcessorInputs,
|
||||
PromptUpdate,
|
||||
TimingContext,
|
||||
)
|
||||
from vllm.sequence import IntermediateTensors
|
||||
|
||||
@@ -193,29 +194,21 @@ class TerratorchMultiModalProcessor(BaseMultiModalProcessor[TerratorchProcessing
|
||||
|
||||
def apply(
|
||||
self,
|
||||
prompt: str | list[int],
|
||||
mm_items: MultiModalDataItems,
|
||||
mm_uuid_items: MultiModalUUIDItems | None = None,
|
||||
hf_processor_mm_kwargs: Mapping[str, object] | None = None,
|
||||
tokenization_kwargs: Mapping[str, object] | None = None,
|
||||
inputs: ProcessorInputs,
|
||||
timing_ctx: TimingContext,
|
||||
) -> MultiModalInputs:
|
||||
if hf_processor_mm_kwargs is None:
|
||||
hf_processor_mm_kwargs = {}
|
||||
if tokenization_kwargs is None:
|
||||
tokenization_kwargs = {}
|
||||
mm_items = inputs.mm_data_items
|
||||
hf_processor_mm_kwargs = inputs.hf_processor_mm_kwargs
|
||||
|
||||
mm_hashes = self._hash_mm_items(
|
||||
mm_items,
|
||||
mm_uuid_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
)
|
||||
|
||||
_, passthrough_data = self._get_hf_mm_data(mm_items)
|
||||
mm_processed_data = BatchFeature(
|
||||
{k: torch.as_tensor(v).unsqueeze(0) for k, v in passthrough_data.items()},
|
||||
tensor_type="pt",
|
||||
)
|
||||
mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]}
|
||||
with timing_ctx.record("apply_hf_processor"):
|
||||
_, passthrough_data = self._get_hf_mm_data(mm_items)
|
||||
mm_processed_data = BatchFeature(
|
||||
{
|
||||
k: torch.as_tensor(v).unsqueeze(0)
|
||||
for k, v in passthrough_data.items()
|
||||
},
|
||||
tensor_type="pt",
|
||||
)
|
||||
|
||||
mm_kwargs = MultiModalKwargsItems.from_hf_inputs(
|
||||
mm_processed_data,
|
||||
@@ -226,6 +219,11 @@ class TerratorchMultiModalProcessor(BaseMultiModalProcessor[TerratorchProcessing
|
||||
),
|
||||
)
|
||||
|
||||
with timing_ctx.record("get_mm_hashes"):
|
||||
mm_hashes = inputs.get_mm_hashes(self.info.model_id)
|
||||
|
||||
mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]}
|
||||
|
||||
return mm_inputs(
|
||||
prompt_token_ids=[1],
|
||||
mm_kwargs=mm_kwargs,
|
||||
|
||||
@@ -37,12 +37,13 @@ from vllm.multimodal.inputs import (
|
||||
from vllm.multimodal.parse import (
|
||||
ImageProcessorItems,
|
||||
MultiModalDataItems,
|
||||
MultiModalUUIDItems,
|
||||
)
|
||||
from vllm.multimodal.processing import (
|
||||
BaseDummyInputsBuilder,
|
||||
BaseMultiModalProcessor,
|
||||
BaseProcessingInfo,
|
||||
ProcessorInputs,
|
||||
TimingContext,
|
||||
)
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.sequence import IntermediateTensors
|
||||
@@ -177,11 +178,8 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
|
||||
|
||||
def apply(
|
||||
self,
|
||||
prompt: str | list[int],
|
||||
mm_items: MultiModalDataItems,
|
||||
mm_uuid_items: MultiModalUUIDItems | None = None,
|
||||
hf_processor_mm_kwargs: Mapping[str, object] | None = None,
|
||||
tokenization_kwargs: Mapping[str, object] | None = None,
|
||||
inputs: ProcessorInputs,
|
||||
timing_ctx: TimingContext,
|
||||
) -> MultiModalInputs:
|
||||
"""
|
||||
Process multi-modal inputs to be used in vLLM.
|
||||
@@ -189,29 +187,30 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
|
||||
Apply HF Processor on prompt text and multi-modal data together,
|
||||
outputting token IDs and processed tensors.
|
||||
"""
|
||||
if hf_processor_mm_kwargs is None:
|
||||
hf_processor_mm_kwargs = {}
|
||||
if tokenization_kwargs is None:
|
||||
tokenization_kwargs = {}
|
||||
prompt = inputs.prompt
|
||||
mm_items = inputs.mm_data_items
|
||||
hf_processor_mm_kwargs = inputs.hf_processor_mm_kwargs
|
||||
tokenization_kwargs = inputs.tokenization_kwargs
|
||||
|
||||
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
|
||||
if not isinstance(prompt, str):
|
||||
# the prompt is the tokenized ids which is not supported
|
||||
# by the hf_processor, which is why we would need to decode the ids
|
||||
# into string
|
||||
prompt = hf_processor.decode(prompt)
|
||||
with timing_ctx.record("apply_hf_processor"):
|
||||
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
|
||||
if not isinstance(prompt, str):
|
||||
# the prompt is the tokenized ids which is not supported
|
||||
# by the hf_processor, which is why we would need to decode the ids
|
||||
# into string
|
||||
prompt = hf_processor.decode(prompt)
|
||||
|
||||
# Bypass cached processor and always apply to the full set of mm inputs
|
||||
# NOTE: we can't just set caching=False because base class method
|
||||
# transforms outputs to `MultiModalKwargs` which is not going to
|
||||
# work for Transformers. We have a lot of logic tied to
|
||||
# `mm_tokens_per_modality` below
|
||||
prompt_ids, processed_data, _ = self._apply_hf_processor_text_mm(
|
||||
prompt_text=prompt,
|
||||
mm_items=mm_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
# Bypass cached processor and always apply to the full set of mm inputs
|
||||
# NOTE: we can't just set caching=False because base class method
|
||||
# transforms outputs to `MultiModalKwargs` which is not going to
|
||||
# work for Transformers. We have a lot of logic tied to
|
||||
# `mm_tokens_per_modality` below
|
||||
prompt_ids, processed_data, _ = self._apply_hf_processor_text_mm(
|
||||
prompt_text=prompt,
|
||||
mm_items=mm_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
|
||||
# For gemma3 we check `token_type_ids` as the key
|
||||
token_type_key = (
|
||||
@@ -225,15 +224,14 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
|
||||
# it for each input `mm_data`.
|
||||
mm_positions = torch.where(mm_token_type_ids == 1)[1]
|
||||
images = mm_items.get_items("image", ImageProcessorItems)
|
||||
multimodal_config = self.info.ctx.model_config.multimodal_config
|
||||
mm_processor_kwargs = multimodal_config.mm_processor_kwargs or {}
|
||||
image_sizes = []
|
||||
for item_idx in range(len(images)):
|
||||
image_size = images.get_image_size(item_idx)
|
||||
image_sizes.append((image_size.height, image_size.width))
|
||||
|
||||
mm_tokens_per_modality = hf_processor._get_num_multimodal_tokens(
|
||||
image_sizes=image_sizes, **mm_processor_kwargs
|
||||
image_sizes=image_sizes,
|
||||
**self.info.ctx.get_merged_mm_kwargs({}),
|
||||
)
|
||||
|
||||
mm_placeholders = {}
|
||||
@@ -261,11 +259,8 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
|
||||
)
|
||||
|
||||
# Use overrides if provided; fallback to data-dependent hashing.
|
||||
mm_hashes = self._hash_mm_items(
|
||||
mm_items,
|
||||
mm_uuid_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
)
|
||||
with timing_ctx.record("get_mm_hashes"):
|
||||
mm_hashes = inputs.get_mm_hashes(self.info.model_id)
|
||||
|
||||
return mm_inputs(
|
||||
prompt_token_ids=prompt_ids,
|
||||
|
||||
@@ -47,16 +47,17 @@ from vllm.multimodal.parse import (
|
||||
AudioProcessorItems,
|
||||
MultiModalDataItems,
|
||||
MultiModalDataParser,
|
||||
MultiModalUUIDItems,
|
||||
)
|
||||
from vllm.multimodal.processing import BaseDummyInputsBuilder, ProcessorInputs
|
||||
from vllm.multimodal.processing import BaseDummyInputsBuilder
|
||||
from vllm.multimodal.processing.processor import (
|
||||
BaseMultiModalProcessor,
|
||||
BaseProcessingInfo,
|
||||
MultiModalProcessingInfo,
|
||||
PlaceholderFeaturesInfo,
|
||||
ProcessorInputs,
|
||||
PromptReplacement,
|
||||
PromptUpdate,
|
||||
TimingContext,
|
||||
)
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.tokenizers import cached_tokenizer_from_config
|
||||
@@ -265,13 +266,13 @@ class VoxtralDummyInputsBuilder(BaseDummyInputsBuilder[VoxtralProcessingInfo]):
|
||||
res = tokenizer.mistral.encode_chat_completion(request)
|
||||
dummy_tokens = res.tokens
|
||||
|
||||
dummy_mm_inputs = self.info.parse_mm_data(
|
||||
dummy_mm_items = self.info.parse_mm_data(
|
||||
# whixtral tokenizer adds padding to the audio
|
||||
# so we need to update the audio arrays
|
||||
{**dummy_mm_data, "audio": [a.audio_array for a in res.audios]},
|
||||
)
|
||||
|
||||
return ProcessorInputs(prompt=dummy_tokens, mm_items=dummy_mm_inputs)
|
||||
return ProcessorInputs(prompt=dummy_tokens, mm_data_items=dummy_mm_items)
|
||||
|
||||
|
||||
class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo]):
|
||||
@@ -361,19 +362,10 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo])
|
||||
|
||||
def _cached_apply_hf_processor(
|
||||
self,
|
||||
prompt: str | list[int],
|
||||
mm_data_items: MultiModalDataItems,
|
||||
mm_uuid_items: MultiModalUUIDItems | None,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Mapping[str, object],
|
||||
inputs: ProcessorInputs,
|
||||
timing_ctx: TimingContext,
|
||||
) -> tuple[list[int], MultiModalProcessingInfo, bool]:
|
||||
prompt_ids, mm_info, _ = super()._cached_apply_hf_processor(
|
||||
prompt=prompt,
|
||||
mm_data_items=mm_data_items,
|
||||
mm_uuid_items=mm_uuid_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
prompt_ids, mm_info, _ = super()._cached_apply_hf_processor(inputs, timing_ctx)
|
||||
|
||||
# NOTE: The tokens are already inserted by the chat template
|
||||
return prompt_ids, mm_info, True
|
||||
|
||||
Reference in New Issue
Block a user