[VLM] Remove image_input_type from VLM config (#5852)
Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
This commit is contained in:
@@ -35,10 +35,9 @@ from vllm.model_executor.models.clip import CLIPVisionModel
|
||||
from vllm.model_executor.models.llama import LlamaModel
|
||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.image import ImagePixelData
|
||||
from vllm.sequence import SamplerOutput
|
||||
|
||||
from .clip import dummy_pixel_data_for_clip, dummy_seq_data_for_clip
|
||||
from .clip import dummy_image_for_clip, dummy_seq_data_for_clip
|
||||
from .interfaces import SupportsVision
|
||||
|
||||
logger = init_logger(__name__)
|
||||
@@ -286,7 +285,7 @@ def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
|
||||
image_token_id=32044,
|
||||
image_feature_size_override=image_feature_size,
|
||||
)
|
||||
mm_data = dummy_pixel_data_for_clip(
|
||||
mm_data = dummy_image_for_clip(
|
||||
CLIP_VIT_LARGE_PATCH14_336_CONFIG,
|
||||
image_width_override=dummy_width,
|
||||
image_height_override=dummy_height,
|
||||
@@ -331,8 +330,7 @@ def _calc_hd_transform_size(*, width: int, height: int, hd_num: int = 16):
|
||||
|
||||
|
||||
def _image_processor(ctx: InputContext,
|
||||
data: ImagePixelData) -> Dict[str, torch.Tensor]:
|
||||
image = data.image
|
||||
image: object) -> Dict[str, torch.Tensor]:
|
||||
|
||||
if isinstance(image, Image.Image):
|
||||
# Temporary patch before dynamic number of image tokens is supported
|
||||
@@ -343,13 +341,14 @@ def _image_processor(ctx: InputContext,
|
||||
"Dynamic image shape is currently not supported. "
|
||||
"Resizing input image to (%d, %d).", w, h)
|
||||
|
||||
data.image = image.resize((w, h))
|
||||
image = image.resize((w, h))
|
||||
|
||||
return MULTIMODAL_REGISTRY._get_plugin_for_data_type(ImagePixelData) \
|
||||
._default_input_mapper(ctx, data)
|
||||
return MULTIMODAL_REGISTRY._get_plugin("image") \
|
||||
._default_input_mapper(ctx, image)
|
||||
raise TypeError(f"Invalid type for 'image': {type(image)}")
|
||||
|
||||
|
||||
@MULTIMODAL_REGISTRY.register_image_pixel_input_mapper(_image_processor)
|
||||
@MULTIMODAL_REGISTRY.register_image_input_mapper(_image_processor)
|
||||
@INPUT_REGISTRY.register_dummy_data(dummy_data_for_phi3v)
|
||||
class Phi3VForCausalLM(nn.Module, SupportsVision):
|
||||
|
||||
@@ -375,14 +374,6 @@ class Phi3VForCausalLM(nn.Module, SupportsVision):
|
||||
pixel_values = kwargs.pop("pixel_values", None)
|
||||
image_sizes = kwargs.pop("image_sizes", None)
|
||||
|
||||
expected_input_type = self.vlm_config.image_input_type
|
||||
ImageInputType = VisionLanguageConfig.ImageInputType
|
||||
|
||||
if expected_input_type != ImageInputType.PIXEL_VALUES:
|
||||
raise ValueError(
|
||||
f"Unexpected image input type: {expected_input_type}."
|
||||
"Phi3v only support pixel_values input currently.")
|
||||
|
||||
if pixel_values is not None and image_sizes is not None:
|
||||
return Phi3VImagePixelInputs(type="pixel_values",
|
||||
data=pixel_values,
|
||||
|
||||
Reference in New Issue
Block a user