[Model] Move vision_feature_select_strategy into resolve_visual_encoder_outputs (#25938)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -235,12 +235,12 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
# Determine the layer up to which we will initialize the vision tower
|
||||
if isinstance(vision_feature_layer, int):
|
||||
vision_hidden_size = config.vision_config.hidden_size
|
||||
self.feature_sample_layers = None
|
||||
self.select_layers = None
|
||||
# Used for multimodal granite models to control encoder outputs
|
||||
elif isinstance(vision_feature_layer, (list, tuple)):
|
||||
vision_hidden_size = config.vision_config.hidden_size * len(
|
||||
vision_feature_layer)
|
||||
self.feature_sample_layers = vision_feature_layer
|
||||
self.select_layers = vision_feature_layer
|
||||
else:
|
||||
raise TypeError(
|
||||
f"vision_layer_feature type: {type(vision_feature_layer)}"
|
||||
@@ -312,30 +312,17 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
|
||||
|
||||
raise AssertionError("This line should be unreachable.")
|
||||
|
||||
def _select_image_features(self, image_features: torch.Tensor, *,
|
||||
strategy: str) -> torch.Tensor:
|
||||
# Copied from https://github.com/huggingface/transformers/blob/39c3c0a72af6fbda5614dde02ff236069bb79827/src/transformers/models/llava/modeling_llava.py#L421 # noqa
|
||||
if strategy == "default":
|
||||
return image_features[:, 1:]
|
||||
elif strategy == "full":
|
||||
return image_features
|
||||
|
||||
raise ValueError(f"Unexpected select feature strategy: {strategy}")
|
||||
|
||||
def _image_pixels_to_features(
|
||||
self,
|
||||
vision_tower: Union[CLIPVisionModel, SiglipVisionModel],
|
||||
pixel_values: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
|
||||
# NOTE: we skip the step to select the vision feature layer since
|
||||
# this is already done inside the vision tower
|
||||
image_features = vision_tower(
|
||||
pixel_values, feature_sample_layers=self.feature_sample_layers)
|
||||
|
||||
return self._select_image_features(
|
||||
image_features,
|
||||
strategy=self.config.vision_feature_select_strategy,
|
||||
return vision_tower(
|
||||
pixel_values,
|
||||
select_layers=self.select_layers,
|
||||
feature_select_strategy=self.config.vision_feature_select_strategy,
|
||||
)
|
||||
|
||||
# Based on: https://github.com/haotian-liu/LLaVA/blob/main/llava/model/llava_arch.py
|
||||
|
||||
Reference in New Issue
Block a user