[Core][VLM] Stack multimodal tensors to represent multiple images within each prompt (#7902)

2024-08-27 18:53:56 -07:00
parent 9c71c97ae2
commit fab5f53e2d
15 changed files with 214 additions and 60 deletions
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -249,6 +249,9 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal):
        image_patches = kwargs.pop("image_patches", None)

        if isinstance(image_patches, torch.Tensor):
+            # Drop the per-prompt image dim (N) until multi-image is supported.
+            image_patches = image_patches.squeeze(1)
+
            expected_feature_size = self.image_feature_size
            if image_patches.size(-1) != expected_feature_size:
                raise ValueError(