[UX] Add --language-model-only for hybrid models (#34120)
Signed-off-by: Roger Wang <hey@rogerw.io>
This commit is contained in:
@@ -297,6 +297,7 @@ class ModelConfig:
|
||||
multimodal_config: MultiModalConfig | None = None
|
||||
"""Configuration for multimodal model. If `None`, this will be inferred
|
||||
from the architecture of `self.model`."""
|
||||
language_model_only: InitVar[bool] = False
|
||||
limit_mm_per_prompt: InitVar[dict[str, int | dict[str, int]] | None] = None
|
||||
enable_mm_embeds: InitVar[bool | None] = None
|
||||
media_io_kwargs: InitVar[dict[str, dict[str, Any]] | None] = None
|
||||
@@ -411,6 +412,7 @@ class ModelConfig:
|
||||
def __post_init__(
|
||||
self,
|
||||
# Multimodal config init vars
|
||||
language_model_only: bool,
|
||||
limit_mm_per_prompt: dict[str, int | dict[str, int]] | None,
|
||||
enable_mm_embeds: bool | None,
|
||||
media_io_kwargs: dict[str, dict[str, Any]] | None,
|
||||
@@ -576,6 +578,7 @@ class ModelConfig:
|
||||
mm_encoder_tp_mode = "weights"
|
||||
|
||||
mm_config_kwargs = dict(
|
||||
language_model_only=language_model_only,
|
||||
limit_per_prompt=limit_mm_per_prompt,
|
||||
enable_mm_embeds=enable_mm_embeds,
|
||||
media_io_kwargs=media_io_kwargs,
|
||||
|
||||
@@ -54,8 +54,12 @@ DummyOptions: TypeAlias = (
|
||||
class MultiModalConfig:
|
||||
"""Controls the behavior of multimodal models."""
|
||||
|
||||
language_model_only: bool = False
|
||||
"""If True, disables all multimodal inputs by setting all modality limits
|
||||
to 0. Equivalent to setting --limit-mm-per-prompt to 0 for every
|
||||
modality."""
|
||||
limit_per_prompt: dict[str, DummyOptions] = Field(default_factory=dict)
|
||||
"""The maximum number of input items and options allowed per
|
||||
prompt for each modality.
|
||||
Defaults to 999 for each modality.
|
||||
|
||||
@@ -63,11 +67,11 @@ class MultiModalConfig:
|
||||
{"image": 16, "video": 2}
|
||||
|
||||
Configurable format (with options):
|
||||
{"video": {"count": 1, "num_frames": 32, "width": 512, "height": 512},
|
||||
"image": {"count": 5, "width": 512, "height": 512}}
|
||||
|
||||
Mixed format (combining both):
|
||||
{"image": 16, "video": {"count": 1, "num_frames": 32, "width": 512,
|
||||
"height": 512}}
|
||||
"""
|
||||
enable_mm_embeds: bool = False
|
||||
@@ -215,6 +219,7 @@ class MultiModalConfig:
|
||||
the final hidden states.
|
||||
"""
|
||||
factors: list[Any] = [
|
||||
self.language_model_only,
|
||||
self.mm_encoder_attn_backend.name
|
||||
if self.mm_encoder_attn_backend is not None
|
||||
else None,
|
||||
@@ -228,6 +233,9 @@ class MultiModalConfig:
|
||||
Get the maximum number of input items allowed per prompt
|
||||
for the given modality (backward compatible).
|
||||
"""
|
||||
if self.language_model_only:
|
||||
return 0
|
||||
|
||||
limit_data = self.limit_per_prompt.get(modality)
|
||||
|
||||
if limit_data is None:
|
||||
|
||||
Reference in New Issue
Block a user