[Bugfix] Fix getting vision features in Transformer Multimodal backend (#32933)
Signed-off-by: raushan <raushan@huggingface.co>
This commit is contained in:
committed by
GitHub
parent
13d8746c54
commit
d95d650762
@@ -376,6 +376,15 @@ class MultiModalMixin(SupportsMultiModal, SupportsMRoPE):
            pixel_values, **kwargs
        )

        # Transformers `v5`, `self.get_image_features` returns a tuple
        # containing the features and optionally attentions/hidden_states
        # After v5 is settled, we can enable qwen3-vl with several outputs
        # from `self.get_image_features`
        if isinstance(vision_embeddings, tuple):
            vision_embeddings = vision_embeddings[0]
        elif isinstance(vision_embeddings, dict):
            vision_embeddings = vision_embeddings.pooler_output

        if isinstance(vision_embeddings, torch.Tensor):
            if vision_embeddings.ndim == 2:
                vision_embeddings = vision_embeddings.unsqueeze(0)
Reference in New Issue
Block a user