[Misc] Add fully interleaved support for multimodal 'string' content format (#14047)

Signed-off-by: drobyshev.anton <drobyshev.anton@wb.ru>
Co-authored-by: drobyshev.anton <drobyshev.anton@wb.ru>
2025-07-07 22:43:08 +03:00
parent 22dd9c2730
commit e601efcb10
4 changed files with 478 additions and 43 deletions
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -346,6 +346,9 @@ class ModelConfig:
    limit_mm_per_prompt: dict[str, int] = field(default_factory=dict)
    """Maximum number of data items per modality per prompt. Only applicable
    for multimodal models."""
+    interleave_mm_strings: bool = False
+    """Enable fully interleaved support for multimodal prompts when using
+    `--chat-template-content-format=string`. Defaults to False."""
    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
    """Additional args passed to process media inputs, keyed by modalities. 
    For example, to set num_frames for video, set 
@@ -702,7 +705,8 @@ class ModelConfig:
                media_io_kwargs=self.media_io_kwargs,
                mm_processor_kwargs=self.mm_processor_kwargs,
                disable_mm_preprocessor_cache=self.
-                disable_mm_preprocessor_cache)
+                disable_mm_preprocessor_cache,
+                interleave_mm_strings=self.interleave_mm_strings)

        if self.limit_mm_per_prompt:
            raise ValueError("`limit_mm_per_prompt` is only supported for "
@@ -713,6 +717,9 @@ class ModelConfig:
        if self.disable_mm_preprocessor_cache:
            raise ValueError("`disable_mm_preprocessor_cache` is only "
                             "supported for multimodal models.")
+        if self.interleave_mm_strings:
+            raise ValueError("`interleave_mm_strings` is only "
+                             "supported for multimodal models.")

        return None

@@ -3126,6 +3133,11 @@ class MultiModalConfig:
    If `True`, disable caching of the processed multi-modal inputs.
    """

+    interleave_mm_strings: bool = False
+    """
+    Enable fully interleaved support for multimodal prompts.
+    """
+
    def compute_hash(self) -> str:
        """
        WARNING: Whenever a new field is added to this config,