diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py
index dff73fe1f..d49cf8850 100644
--- a/vllm/model_executor/models/aria.py
+++ b/vllm/model_executor/models/aria.py
@@ -590,8 +590,6 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
     def _process_image_input(
         self, image_input: AriaImagePixelInputs
     ) -> tuple[torch.Tensor, torch.Tensor]:
-        assert self.vision_tower is not None
-
         pixel_values = image_input["pixel_values"]
         pixel_mask = image_input["pixel_mask"]

diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py
index 0d2efd56f..99184f2b2 100644
--- a/vllm/model_executor/models/aya_vision.py
+++ b/vllm/model_executor/models/aya_vision.py
@@ -382,7 +382,6 @@ class AyaVisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsP
     def _process_image_input(
         self, image_input: AyaVisionImagePixelInputs, **kwargs
     ) -> list[torch.Tensor]:
-        assert self.vision_tower is not None
         pixel_values = image_input["pixel_values"]
         num_patches = image_input["num_patches"]
         image_features = self._image_pixels_to_features(
diff --git a/vllm/model_executor/models/cohere2_vision.py b/vllm/model_executor/models/cohere2_vision.py
index be9c3168e..6dd98c135 100644
--- a/vllm/model_executor/models/cohere2_vision.py
+++ b/vllm/model_executor/models/cohere2_vision.py
@@ -391,8 +391,6 @@ class Cohere2VisionForConditionalGeneration(nn.Module, SupportsMultiModal, Suppo
         Returns:
             List of flattened image embeddings, one per image
         """
-        assert self.vision_tower is not None, "Vision tower is required"
-
         pixel_values = image_input["pixel_values"]
         num_patches = image_input["num_patches"]

diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py
index 79cb1c508..27b26b532 100644
--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -1303,27 +1303,28 @@ class Ernie4_5_VLMoeForConditionalGeneration(
         self.config = config
         self.multimodal_config = multimodal_config

-        self.vision_model = Ernie4_5_VisionTransformer(
-            config.vision_config,
-            norm_eps=getattr(config, "rms_norm_eps", 1e-6),
-            quant_config=quant_config,
-            multimodal_config=multimodal_config,
-            prefix=maybe_prefix(prefix, "vision_model"),
-        )
+        with self._mark_tower_model(vllm_config, {"image", "video"}):
+            self.vision_model = Ernie4_5_VisionTransformer(
+                config.vision_config,
+                norm_eps=getattr(config, "rms_norm_eps", 1e-6),
+                quant_config=quant_config,
+                multimodal_config=multimodal_config,
+                prefix=maybe_prefix(prefix, "vision_model"),
+            )
+            self.resampler_model = VariableResolutionResamplerModel(
+                self.config.pixel_hidden_size,
+                self.config.hidden_size,
+                self.config.spatial_conv_size,
+                self.config.temporal_conv_size,
+                config=self.config,
+                prefix=maybe_prefix(prefix, "resampler_model"),
+            )

-        self.language_model = Ernie4_5_VLMoeForCausalLM(
-            vllm_config=vllm_config,
-            prefix=maybe_prefix(prefix, "language_model"),
-        )
-
-        self.resampler_model = VariableResolutionResamplerModel(
-            self.config.pixel_hidden_size,
-            self.config.hidden_size,
-            self.config.spatial_conv_size,
-            self.config.temporal_conv_size,
-            config=self.config,
-            prefix=maybe_prefix(prefix, "resampler_model"),
-        )
+        with self._mark_language_model(vllm_config):
+            self.language_model = Ernie4_5_VLMoeForCausalLM(
+                vllm_config=vllm_config,
+                prefix=maybe_prefix(prefix, "language_model"),
+            )

         self.visual_token_mask = None
         self.make_empty_intermediate_tensors = (
@@ -1522,9 +1523,6 @@ class Ernie4_5_VLMoeForConditionalGeneration(
         mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item()
         return llm_positions, mrope_position_delta

-    def get_language_model(self) -> torch.nn.Module:
-        return self.language_model
-
     def _parse_and_validate_image_input(
         self, **kwargs: object
     ) -> Ernie4_5_VLImageInputs | None:
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 876e2645b..0733c2f51 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -287,16 +287,20 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         self.image_token_id = _IMAGE_TOKEN_ID
         self.image_feature_size = config.patch_size**2 * config.num_channels

-        self.vision_embed_tokens = ColumnParallelLinear(
-            self.image_feature_size,
-            config.hidden_size,
-            quant_config=quant_config,
-            gather_output=True,
-        )
-        self.language_model = PersimmonForCausalLM(
-            vllm_config=vllm_config.with_hf_config(config.text_config),
-            prefix=maybe_prefix(prefix, "language_model"),
-        )
+        with self._mark_tower_model(vllm_config, "image"):
+            self.vision_embed_tokens = ColumnParallelLinear(
+                self.image_feature_size,
+                config.hidden_size,
+                quant_config=quant_config,
+                gather_output=True,
+            )
+
+        with self._mark_language_model(vllm_config):
+            self.language_model = PersimmonForCausalLM(
+                vllm_config=vllm_config.with_hf_config(config.text_config),
+                prefix=maybe_prefix(prefix, "language_model"),
+            )
+
         self.make_empty_intermediate_tensors = (
             self.language_model.make_empty_intermediate_tensors
         )
@@ -323,14 +327,10 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         image_patches_flat = image_input["image_patches_flat"]
         patches_per_image = image_input["patches_per_image"]

-        assert self.vision_embed_tokens is not None
         vision_embeddings_flat, _ = self.vision_embed_tokens(image_patches_flat)

         return vision_embeddings_flat.split(patches_per_image.tolist(), dim=0)

-    def get_language_model(self) -> torch.nn.Module:
-        return self.language_model
-
     def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
@@ -361,10 +361,7 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
         self,
         hidden_states: torch.Tensor,
     ) -> torch.Tensor | None:
-        logits = self.language_model.logits_processor(
-            self.language_model.lm_head, hidden_states
-        )
-        return logits
+        return self.language_model.compute_logits(hidden_states)

     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(self)
diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py
index 9c986f182..52ae40ba3 100644
--- a/vllm/model_executor/models/gemma3_mm.py
+++ b/vllm/model_executor/models/gemma3_mm.py
@@ -522,25 +522,27 @@ class Gemma3ForConditionalGeneration(
         self.quant_config = quant_config
         self.multimodal_config = multimodal_config

-        self.vision_tower = SiglipVisionModel(
-            config.vision_config,
-            quant_config,
-            prefix=maybe_prefix(prefix, "vision_tower"),
-        )
-        self.multi_modal_projector = Gemma3MultiModalProjector(config)
+        with self._mark_tower_model(vllm_config, "image"):
+            self.vision_tower = SiglipVisionModel(
+                config.vision_config,
+                quant_config,
+                prefix=maybe_prefix(prefix, "vision_tower"),
+            )
+            self.multi_modal_projector = Gemma3MultiModalProjector(config)

-        self.language_model = init_vllm_registered_model(
-            vllm_config=vllm_config,
-            hf_config=config.text_config,
-            prefix=maybe_prefix(prefix, "language_model"),
-            architectures=["Gemma3ForCausalLM"],
-        )
-        logit_scale = getattr(config, "logit_scale", 1.0)
+        with self._mark_language_model(vllm_config):
+            self.language_model = init_vllm_registered_model(
+                vllm_config=vllm_config,
+                hf_config=config.text_config,
+                prefix=maybe_prefix(prefix, "language_model"),
+                architectures=["Gemma3ForCausalLM"],
+            )

-        if hasattr(self.language_model, "logits_processor"):
-            # The logits processor can be unset if we're using
-            # automatic conversion to pooling model.
-            self.language_model.logits_processor.scale *= logit_scale
+        logit_scale = getattr(config, "logit_scale", 1.0)
+        if hasattr(self.language_model, "logits_processor"):
+            # The logits processor can be unset if we're using
+            # automatic conversion to pooling model.
+            self.language_model.logits_processor.scale *= logit_scale

         self.make_empty_intermediate_tensors = (
             self.language_model.make_empty_intermediate_tensors
@@ -579,8 +581,6 @@ class Gemma3ForConditionalGeneration(
         self,
         image_input: Gemma3ImageInputs,
     ) -> list[torch.Tensor]:
-        assert self.vision_tower is not None
-
         pixel_values = image_input["pixel_values"]
         num_patches = image_input["num_patches"]

@@ -592,9 +592,6 @@ class Gemma3ForConditionalGeneration(

         return [e.flatten(0, 1) for e in image_embeds.split(num_patches.tolist())]

-    def get_language_model(self) -> torch.nn.Module:
-        return self.language_model
-
     def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py
index 14e3204dc..be520d117 100644
--- a/vllm/model_executor/models/gemma3n_mm.py
+++ b/vllm/model_executor/models/gemma3n_mm.py
@@ -503,31 +503,35 @@ class Gemma3nForConditionalGeneration(
         self.multimodal_config = multimodal_config
         self.vocab_size = config.text_config.vocab_size

-        self.vision_tower = AutoModel.from_config(config=config.vision_config)
-        self.audio_tower = AutoModel.from_config(config=config.audio_config)
-        self.embed_vision = Gemma3nMultimodalEmbedder(
-            config.vision_config, config.text_config
-        )
-        self.embed_audio = Gemma3nMultimodalEmbedder(
-            config.audio_config, config.text_config
-        )
+        with self._mark_tower_model(vllm_config, "image"):
+            self.vision_tower = AutoModel.from_config(config=config.vision_config)
+            self.embed_vision = Gemma3nMultimodalEmbedder(
+                config.vision_config, config.text_config
+            )

-        self.language_model: nn.Module = init_vllm_registered_model(
-            vllm_config=vllm_config,
-            hf_config=config.text_config,
-            prefix=maybe_prefix(prefix, "language_model"),
-            architectures=["Gemma3nForCausalLM"],
-        )
-        self.language_model = cast(Gemma3nForCausalLM, self.language_model)
-        # NOTE (NickLucche) In order to be compatible with cudagraph, the
-        # buffer needs to be consistent, so we pre-allocate here.
-        self.per_layer_embeddings = torch.zeros(
-            vllm_config.scheduler_config.max_num_batched_tokens,
-            self.config.text_config.num_hidden_layers,
-            self.config.text_config.hidden_size_per_layer_input,
-            device=self.language_model.model.embed_tokens.weight.device,
-            dtype=self.language_model.model.embed_tokens.weight.dtype,
-        )
+        with self._mark_tower_model(vllm_config, "audio"):
+            self.audio_tower = AutoModel.from_config(config=config.audio_config)
+            self.embed_audio = Gemma3nMultimodalEmbedder(
+                config.audio_config, config.text_config
+            )
+
+        with self._mark_language_model(vllm_config):
+            self.language_model: Gemma3nForCausalLM = init_vllm_registered_model(
+                vllm_config=vllm_config,
+                hf_config=config.text_config,
+                prefix=maybe_prefix(prefix, "language_model"),
+                architectures=["Gemma3nForCausalLM"],
+            )
+
+        # NOTE (NickLucche) In order to be compatible with cudagraph, the
+        # buffer needs to be consistent, so we pre-allocate here.
+        self.per_layer_embeddings = torch.zeros(
+            vllm_config.scheduler_config.max_num_batched_tokens,
+            self.config.text_config.num_hidden_layers,
+            self.config.text_config.hidden_size_per_layer_input,
+            device=self.language_model.model.embed_tokens.weight.device,
+            dtype=self.language_model.model.embed_tokens.weight.dtype,
+        )

     def _parse_and_validate_image_input(
         self, **kwargs: object
@@ -583,8 +587,6 @@ class Gemma3nForConditionalGeneration(
         self,
         image_input: Gemma3nImageInputs,
     ) -> list[torch.Tensor]:
-        assert self.vision_tower is not None
-
         pixel_values = image_input["pixel_values"]
         vision_outputs = self.vision_tower(
             pixel_values=pixel_values, do_pooling=False, return_dict=True
@@ -609,7 +611,6 @@ class Gemma3nForConditionalGeneration(
         self,
         audio_input: Gemma3nAudioInputs,
     ) -> list[torch.Tensor]:
-        assert self.audio_tower is not None
         # Run on padded features to enable batching
         input_features = audio_input["input_features_padded"].squeeze(1)
         input_features_mask = audio_input["input_features_mask"].squeeze(1)
@@ -651,9 +652,6 @@ class Gemma3nForConditionalGeneration(
         # Return a list of embeddings instead of a batched tensor
         return audio_features.unbind(0)

-    def get_language_model(self) -> torch.nn.Module:
-        return self.language_model
-
     def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs)
         if mm_input_by_modality is None:
diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py
index 7cc55cf10..acb52f8d6 100644
--- a/vllm/model_executor/models/glm4_1v.py
+++ b/vllm/model_executor/models/glm4_1v.py
@@ -1434,13 +1434,14 @@ class Glm4vForConditionalGeneration(
         self.multimodal_config = multimodal_config
         self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data"

-        self.visual = Glm4vVisionTransformer(
-            config.vision_config,
-            norm_eps=getattr(config, "rms_norm_eps", 1e-5),
-            quant_config=quant_config,
-            multimodal_config=multimodal_config,
-            prefix=maybe_prefix(prefix, "visual"),
-        )
+        with self._mark_tower_model(vllm_config, {"image", "video"}):
+            self.visual = Glm4vVisionTransformer(
+                config.vision_config,
+                norm_eps=getattr(config, "rms_norm_eps", 1e-5),
+                quant_config=quant_config,
+                multimodal_config=multimodal_config,
+                prefix=maybe_prefix(prefix, "visual"),
+            )

         if config.model_type == "glm4v":
             architectures = ["Glm4ForCausalLM"]
@@ -1449,12 +1450,13 @@
         else:
             architectures = None

-        self.language_model = init_vllm_registered_model(
-            vllm_config=vllm_config,
-            hf_config=config.text_config,
-            prefix=maybe_prefix(prefix, "language_model"),
-            architectures=architectures,
-        )
+        with self._mark_language_model(vllm_config):
+            self.language_model = init_vllm_registered_model(
+                vllm_config=vllm_config,
+                hf_config=config.text_config,
+                prefix=maybe_prefix(prefix, "language_model"),
+                architectures=architectures,
+            )

         self.make_empty_intermediate_tensors = (
             self.language_model.make_empty_intermediate_tensors
@@ -1578,9 +1580,6 @@ class Glm4vForConditionalGeneration(
         )
         return mm_input_by_modality

-    def get_language_model(self) -> torch.nn.Module:
-        return self.language_model
-
     def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None:
         mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not mm_input_by_modality:
diff --git a/vllm/model_executor/models/glmasr.py b/vllm/model_executor/models/glmasr.py
index 9f9fda431..d17fe7fcb 100644
--- a/vllm/model_executor/models/glmasr.py
+++ b/vllm/model_executor/models/glmasr.py
@@ -944,26 +944,27 @@ class GlmAsrForConditionalGeneration(
         multimodal_config = vllm_config.model_config.multimodal_config
         self.config = config
         self.multimodal_config = multimodal_config
-
-        # Use optimized vLLM native encoder
-        self.audio_tower = GlmAsrEncoder(
-            config.audio_config,
-            quant_config=quant_config,
-            prefix=maybe_prefix(prefix, "audio_tower"),
-        )
-        self.multi_modal_projector = GlmAsrMultiModalProjector(
-            config,
-            quant_config=quant_config,
-            prefix=maybe_prefix(prefix, "multi_modal_projector"),
-        )
         self.quant_config = quant_config

-        self.language_model = init_vllm_registered_model(
-            vllm_config=vllm_config,
-            hf_config=config.text_config,
-            prefix=maybe_prefix(prefix, "language_model"),
-            architectures=["LlamaForCausalLM"],
-        )
+        with self._mark_tower_model(vllm_config, "audio"):
+            self.audio_tower = GlmAsrEncoder(
+                config.audio_config,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "audio_tower"),
+            )
+            self.multi_modal_projector = GlmAsrMultiModalProjector(
+                config,
+                quant_config=quant_config,
+                prefix=maybe_prefix(prefix, "multi_modal_projector"),
+            )
+
+        with self._mark_language_model(vllm_config):
+            self.language_model = init_vllm_registered_model(
+                vllm_config=vllm_config,
+                hf_config=config.text_config,
+                prefix=maybe_prefix(prefix, "language_model"),
+                architectures=["LlamaForCausalLM"],
+            )

         self.make_empty_intermediate_tensors = (
             self.language_model.make_empty_intermediate_tensors
@@ -1063,9 +1064,6 @@ class GlmAsrForConditionalGeneration(
         )
         return _group_audio_embeddings(chunk_embeddings, chunk_counts)

-    def get_language_model(self) -> torch.nn.Module:
-        return self.language_model
-
     def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         audio_input = self._parse_and_validate_audio_input(**kwargs)
         if audio_input is None:
diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py
index dd0a00b56..ae9a6a211 100644
--- a/vllm/model_executor/models/granite_speech.py
+++ b/vllm/model_executor/models/granite_speech.py
@@ -597,27 +597,29 @@ class GraniteSpeechForConditionalGeneration(
         self.quant_config = quant_config
         self.cache_config = cache_config

-        # The language model is typically a Granite LLM
-        self.language_model = init_vllm_registered_model(
-            vllm_config=vllm_config,
-            hf_config=config.text_config,
-            prefix=maybe_prefix(prefix, "language_model"),
-        )
+        with self._mark_language_model(vllm_config):
+            # The language model is typically a Granite LLM
+            self.language_model = init_vllm_registered_model(
+                vllm_config=vllm_config,
+                hf_config=config.text_config,
+                prefix=maybe_prefix(prefix, "language_model"),
+            )

-        # Conformer encoder
-        self.encoder = GraniteSpeechCTCEncoder(
-            config=config.encoder_config,
-            quant_config=quant_config,
-            prefix=f"{prefix}.encoder",
-        )
+        with self._mark_tower_model(vllm_config, "audio"):
+            # Conformer encoder
+            self.encoder = GraniteSpeechCTCEncoder(
+                config=config.encoder_config,
+                quant_config=quant_config,
+                prefix=f"{prefix}.encoder",
+            )

-        # Blip2 QFormer
-        self.projector = GraniteSpeechEncoderProjector(
-            config=config,
-            quant_config=quant_config,
-            cache_config=cache_config,
-            prefix=f"{prefix}.projector",
-        )
+            # Blip2 QFormer
+            self.projector = GraniteSpeechEncoderProjector(
+                config=config,
+                quant_config=quant_config,
+                cache_config=cache_config,
+                prefix=f"{prefix}.projector",
+            )

         self.make_empty_intermediate_tensors = (
             self.language_model.make_empty_intermediate_tensors
@@ -770,9 +772,6 @@ class GraniteSpeechForConditionalGeneration(
         # Split variable length features into a tuple
         return torch.split(masked_embeds, audio_input["audio_embed_sizes"])

-    def get_language_model(self) -> torch.nn.Module:
-        return self.language_model
-
     def embed_multimodal(
         self,
         **kwargs: object,
diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py
index 82ea7d1ec..ba79ba66c 100644
--- a/vllm/model_executor/models/hunyuan_vision.py
+++ b/vllm/model_executor/models/hunyuan_vision.py
@@ -877,7 +877,7 @@ class HunYuanVLForConditionalGeneration(
         self.config = config
         self.multimodal_config = multimodal_config

-        if multimodal_config.get_limit_per_prompt("image"):
+        with self._mark_tower_model(vllm_config, {"image"}):
             attn_backend_override = (
                 multimodal_config.mm_encoder_attn_backend
                 if multimodal_config is not None
@@ -890,17 +890,16 @@ class HunYuanVLForConditionalGeneration(
                 multimodal_config=multimodal_config,
                 attn_backend_override=attn_backend_override,
             )
-        else:
-            self.visual = None

-        self.language_model = init_vllm_registered_model(
-            vllm_config=vllm_config,
-            prefix=maybe_prefix(prefix, "language_model.model"),
-            architectures=[
-                "HunYuanDenseV1ForCausalLM",
-                "HunYuanMoEV1ForCausalLM",
-            ],
-        )
+        with self._mark_language_model(vllm_config):
+            self.language_model = init_vllm_registered_model(
+                vllm_config=vllm_config,
+                prefix=maybe_prefix(prefix, "language_model.model"),
+                architectures=[
+                    "HunYuanDenseV1ForCausalLM",
+                    "HunYuanMoEV1ForCausalLM",
+                ],
+            )

         self.make_empty_intermediate_tensors = (
             self.language_model.make_empty_intermediate_tensors
@@ -970,9 +969,6 @@ class HunYuanVLForConditionalGeneration(
         )
         return mm_input_by_modality

-    def get_language_model(self) -> torch.nn.Module:
-        return self.language_model
-
     def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings:
         mm_input_by_modality = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not mm_input_by_modality:
diff --git a/vllm/model_executor/models/hyperclovax_vision.py b/vllm/model_executor/models/hyperclovax_vision.py
index 7bc9691c1..215402d0d 100644
--- a/vllm/model_executor/models/hyperclovax_vision.py
+++ b/vllm/model_executor/models/hyperclovax_vision.py
@@ -15,7 +14,6 @@ from einops import rearrange
 from timm.layers import LayerNorm, LayerNorm2d
 from timm.models.regnet import RegStage
 from transformers import BatchFeature, CLIPVisionConfig, SiglipVisionConfig
-from transformers.modeling_utils import no_init_weights

 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions, MultiModalConfig
@@ -625,8 +624,7 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
             config, vision_config
         )

-        # init models & parameters
-        with no_init_weights():  # weight will be loaded in from_pretrained
+        with self._mark_tower_model(vllm_config, {"image", "video"}):
             self.vision_model = init_vision_tower_for_hcxvision(
                 vision_config,
                 quant_config=quant_config,
@@ -635,20 +633,20 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
                 require_post_norm=False,
                 prefix=maybe_prefix(prefix, "vision_model"),
             )
-        self.mm_projector = self._init_mm_projector(config, text_config, vision_config)
+            self.mm_projector = self._init_mm_projector(
+                config, text_config, vision_config
+            )

-        self.lm_head_vocab_size = getattr(
-            text_config, "padded_vocab_size", text_config.vocab_size
-        )
-        self.language_model = init_vllm_registered_model(
-            vllm_config=vllm_config,
-            hf_config=text_config,
-            prefix=maybe_prefix(prefix, "language_model"),
-        )
+            if config.anyres:
+                self.image_newline = nn.Parameter(
+                    torch.empty(text_config.hidden_size, dtype=self.dtype)
+                )

-        if config.anyres:
-            self.image_newline = nn.Parameter(
-                torch.empty(text_config.hidden_size, dtype=self.dtype)
+        with self._mark_language_model(vllm_config):
+            self.language_model = init_vllm_registered_model(
+                vllm_config=vllm_config,
+                hf_config=text_config,
+                prefix=maybe_prefix(prefix, "language_model"),
             )

         self.config = config
@@ -726,9 +724,6 @@ class HCXVisionForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):

         return modalities

-    def get_language_model(self) -> torch.nn.Module:
-        return self.language_model
-
     def embed_multimodal(
         self,
         **kwargs: object,