[Model] Support multiple images for qwen-vl (#8247)
Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
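
With this change, Qwen-VL's multimodal input processor and mapper accept either a single PIL image or a list of PIL images per prompt, so one request can reference several pictures. A minimal offline-inference sketch of how this could be exercised; the model name, image paths, and the "Picture N: <img></img>" placeholder convention are illustrative assumptions, not taken from this commit:

    from PIL import Image

    from vllm import LLM, SamplingParams

    # Hypothetical local images; any PIL.Image.Image objects work here.
    images = [Image.open("cat.jpg"), Image.open("dog.jpg")]

    # Raise the per-prompt image limit above the default of one image.
    llm = LLM(
        model="Qwen/Qwen-VL-Chat",
        trust_remote_code=True,
        limit_mm_per_prompt={"image": len(images)},
    )

    # One placeholder per picture, following Qwen-VL's prompt convention.
    prompt = ("Picture 1: <img></img>\n"
              "Picture 2: <img></img>\n"
              "Describe the difference between the two pictures.")

    outputs = llm.generate(
        {
            "prompt": prompt,
            "multi_modal_data": {"image": images},
        },
        SamplingParams(max_tokens=128),
    )
    print(outputs[0].outputs[0].text)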
@@ -47,6 +47,7 @@ from vllm.multimodal.base import MultiModalInputs
 from vllm.multimodal.utils import cached_get_tokenizer
 from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
                            SequenceData)
+from vllm.utils import is_list_of
 
 from .utils import flatten_bn, is_pp_missing_parameter, make_layers
 
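The newly imported is_list_of helper is what lets the code below tell a list of PIL images apart from a single image. Roughly, it reports whether a value is a list whose elements have a given type; a standalone approximation for illustration only (the real helper lives in vllm.utils and may differ in details such as empty-list or partial element checking):

    from typing import Any, Type

    def is_list_of_approx(value: Any, typ: Type) -> bool:
        # True only for a list whose elements are all instances of `typ`.
        return isinstance(value, list) and all(isinstance(v, typ) for v in value)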
@@ -684,9 +685,12 @@ def input_processor_for_qwen(ctx: InputContext,
             raise ValueError(
                 f"Expected img embeds to be have 3 dimensions, got {num_dims}")
         num_images = 1 if num_dims == 2 else image_data.shape[0]
-    else:
-        # TODO - handle multiple image inputs once the API is solidified
-        num_images = 1
+    elif isinstance(image_data, Image.Image):
+        num_images = 1
+    elif is_list_of(image_data, Image.Image):
+        num_images = len(image_data)
+    else:
+        raise TypeError(f"Invalid image type: {type(image_data)}")
 
     if prompt is None:
         prompt = tokenizer.decode(prompt_token_ids)
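The processor now derives the image count from whichever form multi_modal_data["image"] takes: a pre-computed embedding tensor, a single PIL image, or a list of PIL images. A sketch of just that branching with the surrounding processor logic stripped away (image_data stands in for multi_modal_data["image"]; the 2-D tensor case is treated as one image's embeddings):

    import torch
    from PIL import Image

    from vllm.utils import is_list_of

    def count_images(image_data) -> int:
        if isinstance(image_data, torch.Tensor):
            # Embeddings: (seq_len, hidden) for one image,
            # or (# images, seq_len, hidden) for several.
            return 1 if image_data.ndim == 2 else image_data.shape[0]
        if isinstance(image_data, Image.Image):
            # A single raw image.
            return 1
        if is_list_of(image_data, Image.Image):
            # One entry per picture referenced in the prompt.
            return len(image_data)
        raise TypeError(f"Invalid image type: {type(image_data)}")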
@@ -767,11 +771,11 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs:
                 f"[# images, {MAX_QWEN_IMG_TOKENS}, {img_emb_size}], but "
                 f"received shape [{data.shape}]")
         pixel_values = data
 
     else:
         transform = build_normalization_transform(image_size)
-        # TODO - handle multiple image inputs once the API is solidified
-        transformed_images = [transform(data)]
+        if not isinstance(data, (list, tuple)):
+            data = [data]
+        transformed_images = [transform(datum) for datum in data]
         pixel_values = torch.stack(transformed_images, dim=0)
     return MultiModalInputs({"pixel_values": pixel_values})
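On the mapper side, single-image and multi-image inputs now take the same path: a lone PIL image is wrapped in a list, every image is run through the model's normalization transform, and the results are stacked into one (# images, C, H, W) tensor. A rough stand-in sketch, using a generic torchvision pipeline instead of vLLM's build_normalization_transform helper (the real helper uses the model's configured image size and normalization constants; the 448 below is illustrative):

    import torch
    from PIL import Image
    from torchvision import transforms

    def make_transform(image_size: int = 448):
        # Stand-in for build_normalization_transform(image_size).
        return transforms.Compose([
            transforms.Resize((image_size, image_size)),
            transforms.ToTensor(),
        ])

    def map_images(data):
        transform = make_transform()
        # Wrap a single image so one and many images are handled identically.
        if not isinstance(data, (list, tuple)):
            data = [data]
        transformed = [transform(img.convert("RGB")) for img in data]
        # Resulting shape: (# images, 3, image_size, image_size).
        return torch.stack(transformed, dim=0)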