[MM] Allow skipping memory profiling for multimodal models. (#22950)

Signed-off-by: Roger Wang <hey@rogerw.me>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Author: Roger Wang
Date: 2025-08-15 04:41:38 -07:00 (committed by GitHub)
Parent: 3e6dd40016
Commit: 49252cf59e
4 changed files with 120 additions and 89 deletions

@@ -388,6 +388,10 @@ class ModelConfig:
     interleave_mm_strings: bool = False
     """Enable fully interleaved support for multimodal prompts, while using
     --chat-template-content-format=string. Defaults to False."""
+    skip_mm_profiling: bool = False
+    """When enabled, skips multimodal memory profiling and only profiles with
+    the language backbone model during engine initialization.
+    """
     media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
     """Additional args passed to process media inputs, keyed by modalities.
     For example, to set num_frames for video, set
@@ -837,7 +841,8 @@ class ModelConfig:
             media_io_kwargs=self.media_io_kwargs,
             mm_processor_kwargs=self.mm_processor_kwargs,
             mm_processor_cache_gb=self.mm_processor_cache_gb,
-            interleave_mm_strings=self.interleave_mm_strings)
+            interleave_mm_strings=self.interleave_mm_strings,
+            skip_mm_profiling=self.skip_mm_profiling)
         return None
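
As a usage sketch (not part of this diff): once merged, the new field should be reachable from the offline entrypoint, since LLM forwards engine keyword arguments through EngineArgs into ModelConfig. The checkpoint name below is only an example, and the exact kwarg plumbing is an assumption:

    # Hypothetical usage sketch: enable the new option from the offline API.
    from vllm import LLM

    llm = LLM(
        model="Qwen/Qwen2-VL-2B-Instruct",  # example multimodal checkpoint
        skip_mm_profiling=True,  # assumed to reach ModelConfig via EngineArgs
        gpu_memory_utilization=0.80,  # leave manual headroom (see note below)
    )

With profiling skipped, the engine no longer measures the encoder's activation footprint at startup, so lowering gpu_memory_utilization is one way to leave room for it manually.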
@@ -2511,6 +2516,16 @@ class MultiModalConfig:
     Enable fully interleaved support for multimodal prompts.
     """
 
+    skip_mm_profiling: bool = False
+    """
+    When enabled, skips multimodal memory profiling and only profiles with
+    the language backbone model during engine initialization.
+
+    This reduces engine startup time but shifts to users the responsibility
+    of estimating the peak memory usage of the multimodal encoder activations
+    and the embedding cache.
+    """
+
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,
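
Since the docstring above shifts peak-memory estimation to the user, here is a minimal, generic PyTorch sketch of measuring encoder activation memory. The toy encoder and batch shape are placeholders, not vLLM internals; substitute the real vision tower and a worst-case multimodal batch:

    # Hypothetical measurement sketch with a stand-in encoder.
    import torch
    import torch.nn as nn

    # Placeholder for the model's real vision encoder.
    encoder = nn.Sequential(nn.Conv2d(3, 64, 3), nn.ReLU(), nn.Flatten()).cuda()
    dummy_batch = torch.randn(8, 3, 336, 336, device="cuda")  # worst-case batch

    torch.cuda.reset_peak_memory_stats()
    with torch.inference_mode():
        encoder(dummy_batch)

    peak_gib = torch.cuda.max_memory_allocated() / 2**30
    print(f"peak encoder activation memory: {peak_gib:.2f} GiB")

The measured figure can guide how much headroom to subtract from gpu_memory_utilization when skip_mm_profiling is enabled.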