diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index f4cb701c0..effbafa50 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -397,6 +397,14 @@ VLM_TEST_SETTINGS = { vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}}, patch_hf_runner=model_utils.gemma3_patch_hf_runner, ), + "granite_vision": VLMTestInfo( + models=["ibm-granite/granite-vision-3.3-2b"], + test_type=(VLMTestType.IMAGE), + prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}\n<|assistant|>\n", + max_model_len=8192, + auto_cls=AutoModelForImageTextToText, + vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output, + ), "glm4v": VLMTestInfo( models=["zai-org/glm-4v-9b"], test_type=VLMTestType.IMAGE, diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py index 6f0eef2d1..00a3aea61 100644 --- a/tests/models/multimodal/generation/vlm_utils/model_utils.py +++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py @@ -124,8 +124,10 @@ def _llava_vllm_to_hf_output( if token_id != mm_token_id or output_ids[idx - 1] != mm_token_id ] - assert output_str[0] == " " - hf_output_str = output_str[1:] + # output_str[0] is not " " in some cases, e.g., Granite Vision, + # but for most llava based models, this is the case + hf_output_str = output_str[1:] if output_str[0] == " " else output_str + if hf_output_ids[-1] == eos_token_id: hf_output_str = hf_output_str + tokenizer.decode(eos_token_id) diff --git a/tests/models/registry.py b/tests/models/registry.py index 4b43c33e6..a96722d41 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -692,6 +692,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { trust_remote_code=True, min_transformers_version="5.0", ), + "GraniteVision": _HfExamplesInfo("ibm-granite/granite-vision-3.3-2b"), 
"GraniteSpeechForConditionalGeneration": _HfExamplesInfo( "ibm-granite/granite-speech-3.3-2b" ), diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 631fa862f..0f7815936 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from collections.abc import Callable, Iterable, Mapping -from functools import cached_property +from functools import cached_property, partial from typing import Annotated, Literal import torch @@ -705,6 +705,7 @@ class SiglipVisionTransformer(nn.Module): num_hidden_layers_override: int | None = None, require_post_norm: bool | None = None, prefix: str = "", + use_head: bool | None = False, ) -> None: super().__init__() @@ -738,16 +739,30 @@ class SiglipVisionTransformer(nn.Module): else: self.post_layernorm = None - self.use_head = ( - True if not hasattr(config, "vision_use_head") else config.vision_use_head - ) - if self.use_head: - self.head = SiglipMultiheadAttentionPoolingHead( + # Fall back to the config if a bool is not provided explicitly; + # note that many config types, including SiglipVisionConfig, + # do not have vision_use_head as a defined attribute. 
+ if isinstance(use_head, bool): + self.use_head = use_head + else: + self.use_head = ( + True + if not hasattr(config, "vision_use_head") + else config.vision_use_head + ) + + # Only create and load the head weights if we actually need them + self.head = ( + SiglipMultiheadAttentionPoolingHead( config=config, quant_config=quant_config, multimodal_config=multimodal_config, prefix=f"{prefix}.head", ) + if self.use_head + else None + ) + self.last_hs_proc = partial(self.maybe_layer_norm_and_apply_head) @property def dtype(self): @@ -776,23 +791,37 @@ class SiglipVisionTransformer(nn.Module): return_all_hidden_states=select_layers is not None, ) - if self.post_layernorm is not None: - encoder_outputs = self.post_layernorm(encoder_outputs) - - if self.use_head: - encoder_outputs = self.head(encoder_outputs) - - # stacks feature layers if needed + # In the case that we have multiple feature layers, + # we stack and concatenate them into a tensor. + # NOTE: post layer norm and the attention pooling head + # are handled by last_hs_proc, which runs before applying + # the vision feature selection strategy. encoder_outputs = resolve_visual_encoder_outputs( encoder_outputs, None, select_layers=select_layers, max_possible_layers=self.config.num_hidden_layers, + last_hs_proc=self.last_hs_proc, feature_select_strategy=feature_select_strategy, ) return encoder_outputs + def maybe_layer_norm_and_apply_head( + self, encoder_outputs: torch.Tensor + ) -> torch.Tensor: + """Apply the post layer norm and head if they are enabled, + given the last hidden states tensor. + + args: + encoder_outputs: The last hidden states from the visual encoder. 
+ """ + if self.post_layernorm is not None: + encoder_outputs = self.post_layernorm(encoder_outputs) + if self.head is not None: + encoder_outputs = self.head(encoder_outputs) + return encoder_outputs + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) @@ -809,6 +838,11 @@ class SiglipVisionTransformer(nn.Module): if name.startswith("post_layernorm") and self.post_layernorm is None: continue + # if the model configuration is not going to use + # the pooling head for inference, don't load its weights + if self.head is None and name.startswith("head"): + continue + # omit layers when num_hidden_layers_override is set if name.startswith("encoder.layers"): layer_idx = int(name.split(".")[2]) @@ -841,6 +875,7 @@ class SiglipVisionModel(nn.Module): num_hidden_layers_override: int | None = None, require_post_norm: bool | None = None, prefix: str = "", + use_head: bool | None = False, ) -> None: super().__init__() @@ -852,6 +887,7 @@ class SiglipVisionModel(nn.Module): num_hidden_layers_override=num_hidden_layers_override, require_post_norm=require_post_norm, prefix=f"{prefix}.vision_model", + use_head=use_head, ) def get_input_embeddings(self) -> nn.Module: @@ -898,6 +934,11 @@ class SiglipVisionModel(nn.Module): ): continue + # if the model configuration is not going to use + # the pooling head for inference, don't load its weights + if self.vision_model.head is None and name.startswith("vision_model.head"): + continue + # omit layers when num_hidden_layers_override is set if name.startswith("vision_model.encoder.layers"): layer_idx = int(name.split(".")[3]) @@ -1048,6 +1089,7 @@ class SiglipEmbeddingModel(nn.Module, SupportsMultiModal, SupportsQuant): quant_config=quant_config, multimodal_config=multimodal_config, prefix=maybe_prefix(prefix, "vision_model"), + use_head=None, # Allows potential pooling head ) pooler_config = vllm_config.model_config.pooler_config diff --git 
a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index 2a4bec774..f516a3d47 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -154,6 +154,7 @@ def resolve_visual_encoder_outputs( *, select_layers: list[int] | None = None, max_possible_layers: int | None = None, + last_hs_proc: Callable[[torch.Tensor], torch.Tensor] | None = None, feature_select_strategy: VisionFeatureSelectStrategy | None = None, ) -> torch.Tensor: """Given the outputs a visual encoder module that may correspond to the @@ -166,6 +167,11 @@ def resolve_visual_encoder_outputs( select_layers: Optional layer indices to grab from the encoder outputs; if provided, encoder outputs must be a list. max_possible_layers: Total layers in the fully loaded visual encoder. + last_hs_proc: Optional callable applied to the last layer's hidden + states if that layer is used, e.g., the pooling head for Siglip. This + is done prior to feature selection and layer normalization. If + select_layers is provided, the output of last_hs_proc must be able to + be concatenated with the other selected layers along the last dimension. feature_select_strategy: Defines how to select the hidden states from each layer. 
""" @@ -176,6 +182,11 @@ def resolve_visual_encoder_outputs( "`select_layers` is not provided" ) + # Preprocess the encoder outputs as needed, e.g., map head + # and layer norm for siglip, which runs before feature selection + if last_hs_proc is not None: + encoder_outputs = last_hs_proc(encoder_outputs) + if feature_select_strategy is not None: select_features = _get_vision_feature_selector(feature_select_strategy) encoder_outputs = select_features(encoder_outputs) @@ -205,12 +216,15 @@ def resolve_visual_encoder_outputs( for layer_idx in select_layers ] + uses_last_layer = select_layers[-1] in (max_possible_layers - 1, -1) + if last_hs_proc is not None and uses_last_layer: + hs_pool[-1] = last_hs_proc(hs_pool[-1]) + if feature_select_strategy is not None: select_features = _get_vision_feature_selector(feature_select_strategy) hs_pool = [select_features(hs) for hs in hs_pool] # Apply post-norm on the final hidden state if we are using it - uses_last_layer = select_layers[-1] in (max_possible_layers - 1, -1) if post_layer_norm is not None and uses_last_layer: hs_pool[-1] = post_layer_norm(hs_pool[-1])