[Core][VLM] Stack multimodal tensors to represent multiple images within each prompt (#7902)

This commit is contained in:
Peter Salas
2024-08-27 18:53:56 -07:00
committed by GitHub
parent 9c71c97ae2
commit fab5f53e2d
15 changed files with 214 additions and 60 deletions

View File

@@ -555,6 +555,9 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal):
raise ValueError("Incorrect type of pixel values. "
f"Got type: {type(pixel_values)}")
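# Remove the N dimension (dim 1; presumably the per-prompt image count) until
# multiple images are supported. NOTE: squeeze(1) only drops dim 1 when its
# size is 1; for any other size the tensor is returned unchanged.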
pixel_values = pixel_values.squeeze(1)
return Blip2ImagePixelInputs(
type="pixel_values",
data=self._validate_pixel_values(pixel_values),
@@ -564,6 +567,10 @@ class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal):
if not isinstance(image_embeds, torch.Tensor):
raise ValueError("Incorrect type of image embeddings. "
f"Got type: {type(image_embeds)}")
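# Remove the N dimension (dim 1; presumably the per-prompt image count) from the
# embeddings until multiple images are supported. NOTE: squeeze(1) only drops
# dim 1 when its size is 1; for any other size the tensor is returned unchanged.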
image_embeds = image_embeds.squeeze(1)
return Blip2ImageEmbeddingInputs(
type="image_embeds",
data=image_embeds,