[Refactor] Decouple TimingContext from InputProcessingContext (#35083)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2026-02-23 22:15:50 +08:00
committed by GitHub
parent 1e8438a89a
commit 392645454b
38 changed files with 419 additions and 649 deletions

View File

@@ -41,15 +41,16 @@ from vllm.multimodal.parse import (
ImageProcessorItems,
ImageSize,
MultiModalDataItems,
MultiModalUUIDItems,
)
from vllm.multimodal.processing import (
BaseDummyInputsBuilder,
BaseMultiModalProcessor,
BaseProcessingInfo,
ProcessorInputs,
PromptIndexTargets,
PromptReplacement,
PromptUpdate,
TimingContext,
)
from vllm.sequence import IntermediateTensors
from vllm.utils.tensor_schema import TensorSchema, TensorShape
@@ -204,23 +205,20 @@ class CLIPMultiModalProcessor(BaseMultiModalProcessor[CLIPProcessingInfo]):
def apply(
self,
prompt: str | list[int],
mm_items: MultiModalDataItems,
mm_uuid_items: MultiModalUUIDItems | None = None,
hf_processor_mm_kwargs: Mapping[str, object] | None = None,
tokenization_kwargs: Mapping[str, object] | None = None,
inputs: ProcessorInputs,
timing_ctx: TimingContext,
) -> MultiModalInputs:
if mm_items:
if isinstance(prompt, str):
if len(prompt) > 0:
if inputs.mm_data_items:
if isinstance(inputs.prompt, str):
if len(inputs.prompt) > 0:
raise ValueError(
"CLIP accepts text-only or image-only inputs, not both! "
"You must pass an image with an empty text prompt."
)
else:
special_tokens = self.info.get_tokenizer().all_special_ids
if all(tok in special_tokens for tok in prompt):
prompt = []
if all(tok in special_tokens for tok in inputs.prompt):
inputs.prompt = []
else:
raise ValueError(
"CLIP accepts text-only or image-only inputs, not both! "
@@ -229,18 +227,12 @@ class CLIPMultiModalProcessor(BaseMultiModalProcessor[CLIPProcessingInfo]):
# For multi-modal data, the prompt after processing should
# only contain the dummy image tokens
tokenization_kwargs = {
**(tokenization_kwargs or {}),
inputs.tokenization_kwargs = {
**inputs.tokenization_kwargs,
"add_special_tokens": False,
}
return super().apply(
prompt=prompt,
mm_items=mm_items,
mm_uuid_items=mm_uuid_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs,
)
return super().apply(inputs, timing_ctx)
def _hf_processor_applies_updates(
self,

View File

@@ -30,15 +30,16 @@ from vllm.multimodal.parse import (
ImageProcessorItems,
ImageSize,
MultiModalDataItems,
MultiModalUUIDItems,
)
from vllm.multimodal.processing import BaseDummyInputsBuilder
from vllm.multimodal.processing.processor import (
BaseMultiModalProcessor,
BaseProcessingInfo,
MultiModalProcessingInfo,
ProcessorInputs,
PromptReplacement,
PromptUpdate,
TimingContext,
)
from vllm.sequence import IntermediateTensors
from vllm.tokenizers import cached_tokenizer_from_config
@@ -310,32 +311,17 @@ class DeepseekVL2MultiModalProcessor(
def _cached_apply_hf_processor(
self,
prompt: str | list[int],
mm_data_items: MultiModalDataItems,
mm_uuid_items: MultiModalUUIDItems | None,
hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Mapping[str, object],
inputs: ProcessorInputs,
timing_ctx: TimingContext,
) -> tuple[list[int], MultiModalProcessingInfo, bool]:
# The processor logic is different for len(images) <= 2 vs > 2
# Since the processing cache assumes that the processor output is
# invariant of how many images are passed per prompt, we only
# perform caching for the most common case
if mm_data_items.get_count("image", strict=False) > 2:
return self._apply_hf_processor(
prompt=prompt,
mm_data_items=mm_data_items,
mm_uuid_items=mm_uuid_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs,
)
if inputs.mm_data_items.get_count("image", strict=False) > 2:
return self._apply_hf_processor(inputs, timing_ctx)
return super()._cached_apply_hf_processor(
prompt=prompt,
mm_data_items=mm_data_items,
mm_uuid_items=mm_uuid_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs,
)
return super()._cached_apply_hf_processor(inputs, timing_ctx)
@MULTIMODAL_REGISTRY.register_processor(

View File

@@ -21,13 +21,14 @@ from vllm.multimodal.parse import (
ImageEmbeddingItems,
ImageProcessorItems,
MultiModalDataItems,
MultiModalUUIDItems,
)
from vllm.multimodal.processing.processor import (
MultiModalProcessingInfo,
ProcessorInputs,
PromptReplacement,
PromptUpdate,
PromptUpdateDetails,
TimingContext,
)
from vllm.tokenizers import TokenizerLike
@@ -490,32 +491,17 @@ class H2OVLMultiModalProcessor(BaseInternVLMultiModalProcessor[H2OVLProcessingIn
def _cached_apply_hf_processor(
self,
prompt: str | list[int],
mm_data_items: MultiModalDataItems,
mm_uuid_items: MultiModalUUIDItems | None,
hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Mapping[str, object],
inputs: ProcessorInputs,
timing_ctx: TimingContext,
) -> tuple[list[int], MultiModalProcessingInfo, bool]:
# The processor logic is different for len(images) <= 1 vs > 1
# Since the processing cache assumes that the processor output is
# invariant of how many images are passed per prompt, we only
# perform caching for the most common case
if mm_data_items.get_count("image", strict=False) > 1:
return self._apply_hf_processor(
prompt=prompt,
mm_data_items=mm_data_items,
mm_uuid_items=mm_uuid_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs,
)
if inputs.mm_data_items.get_count("image", strict=False) > 1:
return self._apply_hf_processor(inputs, timing_ctx)
return super()._cached_apply_hf_processor(
prompt=prompt,
mm_data_items=mm_data_items,
mm_uuid_items=mm_uuid_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs,
)
return super()._cached_apply_hf_processor(inputs, timing_ctx)
@MULTIMODAL_REGISTRY.register_processor(

View File

@@ -37,16 +37,17 @@ from vllm.multimodal.parse import (
ImageProcessorItems,
ImageSize,
MultiModalDataItems,
MultiModalUUIDItems,
)
from vllm.multimodal.processing import (
BaseDummyInputsBuilder,
BaseMultiModalProcessor,
BaseProcessingInfo,
InputProcessingContext,
ProcessorInputs,
PromptReplacement,
PromptUpdate,
PromptUpdateDetails,
TimingContext,
)
from vllm.sequence import IntermediateTensors
from vllm.utils.tensor_schema import TensorSchema, TensorShape
@@ -770,11 +771,8 @@ class MantisProcessingInfo(LlavaProcessingInfo):
class MantisMultiModalProcessor(LlavaMultiModalProcessor):
def apply(
self,
prompt: str | list[int],
mm_items: MultiModalDataItems,
mm_uuid_items: MultiModalUUIDItems | None = None,
hf_processor_mm_kwargs: Mapping[str, object] | None = None,
tokenization_kwargs: Mapping[str, object] | None = None,
inputs: ProcessorInputs,
timing_ctx: TimingContext,
) -> MultiModalInputs:
hf_config = self.info.get_hf_config()
image_token_id = hf_config.image_token_index
@@ -785,15 +783,9 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
image_height=-1,
)
result = super().apply(
prompt,
mm_items,
mm_uuid_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs,
)
result = super().apply(inputs, timing_ctx)
mm_item_counts = mm_items.get_all_counts()
mm_item_counts = inputs.mm_data_items.get_all_counts()
mm_kwargs = result["mm_kwargs"]
mm_hashes = result["mm_hashes"]
@@ -825,8 +817,8 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
)
orig_repls = self._get_mm_prompt_updates(
mm_items,
hf_processor_mm_kwargs,
inputs.mm_data_items,
inputs.hf_processor_mm_kwargs,
mm_kwargs,
)
mm_placeholders = self._find_mm_placeholders(prompt_ids, orig_repls)

View File

@@ -21,16 +21,17 @@ from vllm.multimodal.parse import (
ImageEmbeddingItems,
ImageProcessorItems,
MultiModalDataItems,
MultiModalUUIDItems,
)
from vllm.multimodal.processing import (
BaseDummyInputsBuilder,
BaseMultiModalProcessor,
BaseProcessingInfo,
ProcessorInputs,
PromptIndexTargets,
PromptInsertion,
PromptUpdate,
PromptUpdateDetails,
TimingContext,
)
from vllm.renderers import TokenizeParams
from vllm.sequence import IntermediateTensors
@@ -228,19 +229,10 @@ class PaliGemmaMultiModalProcessor(BaseMultiModalProcessor[PaliGemmaProcessingIn
def apply(
self,
prompt: str | list[int],
mm_items: MultiModalDataItems,
mm_uuid_items: MultiModalUUIDItems | None = None,
hf_processor_mm_kwargs: Mapping[str, object] | None = None,
tokenization_kwargs: Mapping[str, object] | None = None,
inputs: ProcessorInputs,
timing_ctx: TimingContext,
) -> MultiModalInputs:
mm_inputs = super().apply(
prompt,
mm_items,
mm_uuid_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs,
)
mm_inputs = super().apply(inputs, timing_ctx)
prompt_token_ids = mm_inputs["prompt_token_ids"]
tokenizer = self.info.get_tokenizer()

View File

@@ -50,16 +50,17 @@ from vllm.multimodal.parse import (
ImageProcessorItems,
ImageSize,
MultiModalDataItems,
MultiModalUUIDItems,
)
from vllm.multimodal.processing import BaseDummyInputsBuilder, ProcessorInputs
from vllm.multimodal.processing import BaseDummyInputsBuilder
from vllm.multimodal.processing.processor import (
BaseMultiModalProcessor,
BaseProcessingInfo,
MultiModalProcessingInfo,
ProcessorInputs,
PromptReplacement,
PromptUpdate,
PromptUpdateDetails,
TimingContext,
)
from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors
@@ -277,7 +278,6 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):
dummy_text = self.get_dummy_text(mm_counts)
dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts, mm_options)
dummy_images = dummy_mm_data.get("image", [])
tokenization_kwargs = {"truncation": False}
request = ChatCompletionRequest(
messages=[
@@ -294,11 +294,7 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):
dummy_mm_items = self.info.parse_mm_data(dummy_mm_data)
return ProcessorInputs(
prompt=dummy_tokens,
mm_items=dummy_mm_items,
tokenization_kwargs=tokenization_kwargs,
)
return ProcessorInputs(prompt=dummy_tokens, mm_data_items=dummy_mm_items)
class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]):
@@ -344,19 +340,10 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo])
def _cached_apply_hf_processor(
self,
prompt: str | list[int],
mm_data_items: MultiModalDataItems,
mm_uuid_items: MultiModalUUIDItems | None,
hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Mapping[str, object],
inputs: ProcessorInputs,
timing_ctx: TimingContext,
) -> tuple[list[int], MultiModalProcessingInfo, bool]:
prompt_ids, mm_info, _ = super()._cached_apply_hf_processor(
prompt=prompt,
mm_data_items=mm_data_items,
mm_uuid_items=mm_uuid_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs,
)
prompt_ids, mm_info, _ = super()._cached_apply_hf_processor(inputs, timing_ctx)
# NOTE: The tokens are already inserted by the chat template
return prompt_ids, mm_info, True

View File

@@ -47,15 +47,16 @@ from vllm.multimodal.parse import (
ImageProcessorItems,
ImageSize,
MultiModalDataItems,
MultiModalUUIDItems,
)
from vllm.multimodal.processing import (
BaseDummyInputsBuilder,
BaseMultiModalProcessor,
BaseProcessingInfo,
ProcessorInputs,
PromptIndexTargets,
PromptReplacement,
PromptUpdate,
TimingContext,
)
from vllm.sequence import IntermediateTensors
from vllm.utils.tensor_schema import TensorSchema, TensorShape
@@ -190,23 +191,20 @@ class SiglipMultiModalProcessor(BaseMultiModalProcessor[SiglipProcessingInfo]):
def apply(
self,
prompt: str | list[int],
mm_items: MultiModalDataItems,
mm_uuid_items: MultiModalUUIDItems | None = None,
hf_processor_mm_kwargs: Mapping[str, object] | None = None,
tokenization_kwargs: Mapping[str, object] | None = None,
inputs: ProcessorInputs,
timing_ctx: TimingContext,
) -> MultiModalInputs:
if mm_items:
if isinstance(prompt, str):
if len(prompt) > 0:
if inputs.mm_data_items:
if isinstance(inputs.prompt, str):
if len(inputs.prompt) > 0:
raise ValueError(
"SigLIP accepts text-only or image-only inputs, not both! "
"You must pass an image with an empty text prompt."
)
else:
special_tokens = self.info.get_tokenizer().all_special_ids
if all(tok in special_tokens for tok in prompt):
prompt = []
if all(tok in special_tokens for tok in inputs.prompt):
inputs.prompt = []
else:
raise ValueError(
"SigLIP accepts text-only or image-only inputs, not both! "
@@ -214,19 +212,13 @@ class SiglipMultiModalProcessor(BaseMultiModalProcessor[SiglipProcessingInfo]):
)
# For multi-modal data, the prompt after processing should
# only contain the image token
tokenization_kwargs = {
**(tokenization_kwargs or {}),
# only contain the dummy image tokens
inputs.tokenization_kwargs = {
**inputs.tokenization_kwargs,
"add_special_tokens": False,
}
return super().apply(
prompt=prompt,
mm_items=mm_items,
mm_uuid_items=mm_uuid_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs,
)
return super().apply(inputs, timing_ctx)
def _hf_processor_applies_updates(
self,

View File

@@ -54,13 +54,14 @@ from vllm.multimodal.parse import (
ModalityDataItems,
MultiModalDataItems,
MultiModalDataParser,
MultiModalUUIDItems,
)
from vllm.multimodal.processing import (
BaseDummyInputsBuilder,
BaseMultiModalProcessor,
BaseProcessingInfo,
ProcessorInputs,
PromptUpdate,
TimingContext,
)
from vllm.sequence import IntermediateTensors
@@ -193,29 +194,21 @@ class TerratorchMultiModalProcessor(BaseMultiModalProcessor[TerratorchProcessing
def apply(
self,
prompt: str | list[int],
mm_items: MultiModalDataItems,
mm_uuid_items: MultiModalUUIDItems | None = None,
hf_processor_mm_kwargs: Mapping[str, object] | None = None,
tokenization_kwargs: Mapping[str, object] | None = None,
inputs: ProcessorInputs,
timing_ctx: TimingContext,
) -> MultiModalInputs:
if hf_processor_mm_kwargs is None:
hf_processor_mm_kwargs = {}
if tokenization_kwargs is None:
tokenization_kwargs = {}
mm_items = inputs.mm_data_items
hf_processor_mm_kwargs = inputs.hf_processor_mm_kwargs
mm_hashes = self._hash_mm_items(
mm_items,
mm_uuid_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
)
_, passthrough_data = self._get_hf_mm_data(mm_items)
mm_processed_data = BatchFeature(
{k: torch.as_tensor(v).unsqueeze(0) for k, v in passthrough_data.items()},
tensor_type="pt",
)
mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]}
with timing_ctx.record("apply_hf_processor"):
_, passthrough_data = self._get_hf_mm_data(mm_items)
mm_processed_data = BatchFeature(
{
k: torch.as_tensor(v).unsqueeze(0)
for k, v in passthrough_data.items()
},
tensor_type="pt",
)
mm_kwargs = MultiModalKwargsItems.from_hf_inputs(
mm_processed_data,
@@ -226,6 +219,11 @@ class TerratorchMultiModalProcessor(BaseMultiModalProcessor[TerratorchProcessing
),
)
with timing_ctx.record("get_mm_hashes"):
mm_hashes = inputs.get_mm_hashes(self.info.model_id)
mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]}
return mm_inputs(
prompt_token_ids=[1],
mm_kwargs=mm_kwargs,

View File

@@ -37,12 +37,13 @@ from vllm.multimodal.inputs import (
from vllm.multimodal.parse import (
ImageProcessorItems,
MultiModalDataItems,
MultiModalUUIDItems,
)
from vllm.multimodal.processing import (
BaseDummyInputsBuilder,
BaseMultiModalProcessor,
BaseProcessingInfo,
ProcessorInputs,
TimingContext,
)
from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors
@@ -177,11 +178,8 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
def apply(
self,
prompt: str | list[int],
mm_items: MultiModalDataItems,
mm_uuid_items: MultiModalUUIDItems | None = None,
hf_processor_mm_kwargs: Mapping[str, object] | None = None,
tokenization_kwargs: Mapping[str, object] | None = None,
inputs: ProcessorInputs,
timing_ctx: TimingContext,
) -> MultiModalInputs:
"""
Process multi-modal inputs to be used in vLLM.
@@ -189,29 +187,30 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
Apply HF Processor on prompt text and multi-modal data together,
outputting token IDs and processed tensors.
"""
if hf_processor_mm_kwargs is None:
hf_processor_mm_kwargs = {}
if tokenization_kwargs is None:
tokenization_kwargs = {}
prompt = inputs.prompt
mm_items = inputs.mm_data_items
hf_processor_mm_kwargs = inputs.hf_processor_mm_kwargs
tokenization_kwargs = inputs.tokenization_kwargs
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
if not isinstance(prompt, str):
# the prompt is the tokenized ids which is not supported
# by the hf_processor, which is why we would need to decode the ids
# into string
prompt = hf_processor.decode(prompt)
with timing_ctx.record("apply_hf_processor"):
hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
if not isinstance(prompt, str):
# the prompt is the tokenized ids which is not supported
# by the hf_processor, which is why we would need to decode the ids
# into string
prompt = hf_processor.decode(prompt)
# Bypass cached processor and always apply to the full set of mm inputs
# NOTE: we can't just set caching=False because base class method
# transforms outputs to `MultiModalKwargs` which is not going to
# work for Transformers. We have a lot of logic tied to
# `mm_tokens_per_modality` below
prompt_ids, processed_data, _ = self._apply_hf_processor_text_mm(
prompt_text=prompt,
mm_items=mm_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs,
)
# Bypass cached processor and always apply to the full set of mm inputs
# NOTE: we can't just set caching=False because base class method
# transforms outputs to `MultiModalKwargs` which is not going to
# work for Transformers. We have a lot of logic tied to
# `mm_tokens_per_modality` below
prompt_ids, processed_data, _ = self._apply_hf_processor_text_mm(
prompt_text=prompt,
mm_items=mm_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs,
)
# For gemma3 we check `token_type_ids` as the key
token_type_key = (
@@ -225,15 +224,14 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
# it for each input `mm_data`.
mm_positions = torch.where(mm_token_type_ids == 1)[1]
images = mm_items.get_items("image", ImageProcessorItems)
multimodal_config = self.info.ctx.model_config.multimodal_config
mm_processor_kwargs = multimodal_config.mm_processor_kwargs or {}
image_sizes = []
for item_idx in range(len(images)):
image_size = images.get_image_size(item_idx)
image_sizes.append((image_size.height, image_size.width))
mm_tokens_per_modality = hf_processor._get_num_multimodal_tokens(
image_sizes=image_sizes, **mm_processor_kwargs
image_sizes=image_sizes,
**self.info.ctx.get_merged_mm_kwargs({}),
)
mm_placeholders = {}
@@ -261,11 +259,8 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
)
# Use overrides if provided; fallback to data-dependent hashing.
mm_hashes = self._hash_mm_items(
mm_items,
mm_uuid_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
)
with timing_ctx.record("get_mm_hashes"):
mm_hashes = inputs.get_mm_hashes(self.info.model_id)
return mm_inputs(
prompt_token_ids=prompt_ids,

View File

@@ -47,16 +47,17 @@ from vllm.multimodal.parse import (
AudioProcessorItems,
MultiModalDataItems,
MultiModalDataParser,
MultiModalUUIDItems,
)
from vllm.multimodal.processing import BaseDummyInputsBuilder, ProcessorInputs
from vllm.multimodal.processing import BaseDummyInputsBuilder
from vllm.multimodal.processing.processor import (
BaseMultiModalProcessor,
BaseProcessingInfo,
MultiModalProcessingInfo,
PlaceholderFeaturesInfo,
ProcessorInputs,
PromptReplacement,
PromptUpdate,
TimingContext,
)
from vllm.sequence import IntermediateTensors
from vllm.tokenizers import cached_tokenizer_from_config
@@ -265,13 +266,13 @@ class VoxtralDummyInputsBuilder(BaseDummyInputsBuilder[VoxtralProcessingInfo]):
res = tokenizer.mistral.encode_chat_completion(request)
dummy_tokens = res.tokens
dummy_mm_inputs = self.info.parse_mm_data(
dummy_mm_items = self.info.parse_mm_data(
# whixtral tokenizer adds padding to the audio
# so we need to update the audio arrays
{**dummy_mm_data, "audio": [a.audio_array for a in res.audios]},
)
return ProcessorInputs(prompt=dummy_tokens, mm_items=dummy_mm_inputs)
return ProcessorInputs(prompt=dummy_tokens, mm_data_items=dummy_mm_items)
class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo]):
@@ -361,19 +362,10 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo])
def _cached_apply_hf_processor(
self,
prompt: str | list[int],
mm_data_items: MultiModalDataItems,
mm_uuid_items: MultiModalUUIDItems | None,
hf_processor_mm_kwargs: Mapping[str, object],
tokenization_kwargs: Mapping[str, object],
inputs: ProcessorInputs,
timing_ctx: TimingContext,
) -> tuple[list[int], MultiModalProcessingInfo, bool]:
prompt_ids, mm_info, _ = super()._cached_apply_hf_processor(
prompt=prompt,
mm_data_items=mm_data_items,
mm_uuid_items=mm_uuid_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
tokenization_kwargs=tokenization_kwargs,
)
prompt_ids, mm_info, _ = super()._cached_apply_hf_processor(inputs, timing_ctx)
# NOTE: The tokens are already inserted by the chat template
return prompt_ids, mm_info, True