[Misc] Add fully interleaved support for multimodal 'string' content format (#14047)

Signed-off-by: drobyshev.anton <drobyshev.anton@wb.ru>
Co-authored-by: drobyshev.anton <drobyshev.anton@wb.ru>
This commit is contained in:
Anton
2025-07-07 22:43:08 +03:00
committed by GitHub
parent 22dd9c2730
commit e601efcb10
4 changed files with 478 additions and 43 deletions

View File

@@ -346,6 +346,9 @@ class ModelConfig:
limit_mm_per_prompt: dict[str, int] = field(default_factory=dict)
"""Maximum number of data items per modality per prompt. Only applicable
for multimodal models."""
interleave_mm_strings: bool = False
"""Enable fully interleaved support for multimodal prompts, while using
--chat-template-content-format=string. Defaults to False."""
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
"""Additional args passed to process media inputs, keyed by modalities.
For example, to set num_frames for video, set
@@ -702,7 +705,8 @@ class ModelConfig:
media_io_kwargs=self.media_io_kwargs,
mm_processor_kwargs=self.mm_processor_kwargs,
disable_mm_preprocessor_cache=self.
disable_mm_preprocessor_cache)
disable_mm_preprocessor_cache,
interleave_mm_strings=self.interleave_mm_strings)
if self.limit_mm_per_prompt:
raise ValueError("`limit_mm_per_prompt` is only supported for "
@@ -713,6 +717,9 @@ class ModelConfig:
if self.disable_mm_preprocessor_cache:
raise ValueError("`disable_mm_preprocessor_cache` is only "
"supported for multimodal models.")
if self.interleave_mm_strings:
raise ValueError("`interleave_mm_strings` is only "
"supported for multimodal models.")
return None
@@ -3126,6 +3133,11 @@ class MultiModalConfig:
If `True`, disable caching of the processed multi-modal inputs.
"""
interleave_mm_strings: bool = False
"""
Enable fully interleaved support for multimodal prompts.
"""
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,