[Core] Support image processor (#4197)
@@ -17,6 +17,8 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.image import get_dummy_image_data
 from vllm.sequence import SamplerOutput
 
 from .vlm_base import VisionLanguageModelBase
@@ -82,6 +84,9 @@ class LlavaImageFeatureInputs(TypedDict):
 LlavaImageInputs = Union[LlavaImagePixelInputs, LlavaImageFeatureInputs]
 
 
+@MULTIMODAL_REGISTRY.register_image_feature_input()
+@MULTIMODAL_REGISTRY.register_image_pixel_input()
+@MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data)
 class LlavaForConditionalGeneration(VisionLanguageModelBase):
 
     def __init__(self,
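For context on the three decorators added above: they attach the LLaVA class to vLLM's multimodal registry so that image data reaches the model as keyword arguments (see the forward() change further down in this diff). Below is a minimal toy sketch of the decorator-registration pattern only; ToyRegistry and ToyVisionLanguageModel are hypothetical names and do not reflect MULTIMODAL_REGISTRY's actual internals.

# Toy sketch: a registry whose decorator records which keyword inputs a
# model class accepts. Hypothetical names; for illustration only.
from typing import Callable, Dict, List, Type


class ToyRegistry:

    def __init__(self) -> None:
        self.supported_inputs: Dict[Type, List[str]] = {}

    def register_image_pixel_input(self) -> Callable[[Type], Type]:

        def wrapper(model_cls: Type) -> Type:
            # Record that this model accepts a "pixel_values" keyword argument.
            self.supported_inputs.setdefault(model_cls, []).append("pixel_values")
            return model_cls

        return wrapper


TOY_REGISTRY = ToyRegistry()


@TOY_REGISTRY.register_image_pixel_input()
class ToyVisionLanguageModel:
    pass


assert TOY_REGISTRY.supported_inputs[ToyVisionLanguageModel] == ["pixel_values"]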
@@ -131,30 +136,41 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase):
         return data
 
     def _parse_and_validate_image_input(
-            self, data: object) -> Optional[LlavaImageInputs]:
+            self, **kwargs: object) -> Optional[LlavaImageInputs]:
+        pixel_values = kwargs.pop("pixel_values", None)
+        image_features = kwargs.pop("image_features", None)
+
         expected_input_type = self.vision_language_config.image_input_type
         ImageInputType = VisionLanguageConfig.ImageInputType
 
-        if data is None:
-            return None
-
         if expected_input_type == ImageInputType.PIXEL_VALUES:
-            if not isinstance(data, torch.Tensor):
-                raise TypeError("Image pixel vector should be a tensor, "
-                                f"but received type: {type(data)}")
+            if image_features is not None:
+                raise ValueError(
+                    "Expected pixel values but got image features")
+            if pixel_values is None:
+                return None
+
+            if not isinstance(pixel_values, torch.Tensor):
+                raise ValueError("Incorrect type of pixel values")
 
             return LlavaImagePixelInputs(
                 type="pixel_values",
-                data=self._validate_image_data(data),
+                data=self._validate_image_data(pixel_values),
             )
-        elif expected_input_type == ImageInputType.IMAGE_FEATURES:
-            if not isinstance(data, torch.Tensor):
-                raise TypeError("Image feature vector should be a tensor, "
-                                f"but received type: {type(data)}")
+
+        if expected_input_type == ImageInputType.IMAGE_FEATURES:
+            if pixel_values is not None:
+                raise ValueError(
+                    "Expected image features but got pixel values")
+            if image_features is None:
+                return None
+
+            if not isinstance(image_features, torch.Tensor):
+                raise ValueError("Incorrect type of image features")
 
             return LlavaImageFeatureInputs(
                 type="image_features",
-                data=self._validate_image_data(data),
+                data=self._validate_image_data(image_features),
             )
 
         return None
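The reworked parser above no longer receives a single positional tensor; it pops pixel_values and image_features out of **kwargs and checks them against the configured input type. A standalone sketch of that keyword-argument validation pattern follows; ImageInputType and parse_image_input here are illustrative stand-ins, not the vLLM classes, and the tensor shape matches the docstring later in this diff.

# Standalone sketch of the kwargs-based validation pattern (illustrative only).
from enum import Enum, auto
from typing import Optional

import torch


class ImageInputType(Enum):
    PIXEL_VALUES = auto()
    IMAGE_FEATURES = auto()


def parse_image_input(expected: ImageInputType,
                      **kwargs: object) -> Optional[torch.Tensor]:
    pixel_values = kwargs.pop("pixel_values", None)
    image_features = kwargs.pop("image_features", None)

    if expected is ImageInputType.PIXEL_VALUES:
        # Reject the wrong kind of input outright ...
        if image_features is not None:
            raise ValueError("Expected pixel values but got image features")
        # ... treat a missing input as "no image in this step" ...
        if pixel_values is None:
            return None
        # ... and type-check what was actually supplied.
        if not isinstance(pixel_values, torch.Tensor):
            raise ValueError("Incorrect type of pixel values")
        return pixel_values

    if expected is ImageInputType.IMAGE_FEATURES:
        # Mirrors the branch above with the roles swapped.
        if pixel_values is not None:
            raise ValueError("Expected image features but got pixel values")
        if image_features is None:
            return None
        if not isinstance(image_features, torch.Tensor):
            raise ValueError("Incorrect type of image features")
        return image_features

    return None


# Example call; [1, 3, 336, 336] is the pixel-value shape from the docstring.
parsed = parse_image_input(ImageInputType.PIXEL_VALUES,
                           pixel_values=torch.zeros(1, 3, 336, 336))
assert parsed is not None and parsed.shape == (1, 3, 336, 336)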
@@ -201,12 +217,14 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase):
 
         return self.multi_modal_projector(image_features)
 
-    def forward(self,
-                input_ids: torch.Tensor,
-                positions: torch.Tensor,
-                kv_caches: List[torch.Tensor],
-                attn_metadata: AttentionMetadata,
-                image_input: Optional[torch.Tensor] = None) -> SamplerOutput:
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        **kwargs: object,
+    ) -> SamplerOutput:
         """Run forward pass for Llava 1.5.
 
         One key thing to understand is the `input_ids` already accounts for the
@@ -227,10 +245,10 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase):
         This way, the `positions` and `attn_metadata` are consistent
         with the `input_ids`.
 
-        The model takes two types of image inputs: 
+        The model takes two types of image inputs:
         PIXEL_VALUES and IMAGE_FEATURES.
         The following shows how each maps to huggingface implementation.
-        PIXEL_VALUES: 
+        PIXEL_VALUES:
         - https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L353
         IMAGE_FEATURES:
         - https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L430
@@ -239,14 +257,15 @@ class LlavaForConditionalGeneration(VisionLanguageModelBase):
         Args:
             input_ids: Flattened (concatenated) input_ids corresponding to a
                 batch.
-            image_input: A batch of image inputs.
-                For PIXEL_VALUES, expecting [1, 3, 336, 336].
-                For IMAGE_FEATURES, expecting [1, 576, 1024].
+            pixel_values: For PIXEL_VALUES, expects a batch with shape
+                [1, 3, 336, 336].
+            image_features: For IMAGE_FEATURES, expects a batch with shape
+                [1, 576, 1024].
         """
-        parsed_image_input = self._parse_and_validate_image_input(image_input)
+        image_input = self._parse_and_validate_image_input(**kwargs)
 
-        if parsed_image_input is not None:
-            vision_embeddings = self._process_image_input(parsed_image_input)
+        if image_input is not None:
+            vision_embeddings = self._process_image_input(image_input)
             inputs_embeds = self.language_model.get_input_embeddings(input_ids)
 
             inputs_embeds = _merge_vision_embeddings(
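The hunk ends on the call to _merge_vision_embeddings, which performs the substitution the docstring describes: embedding positions whose input_ids hold the image token are overwritten with the projected vision embeddings. A rough, self-contained sketch of that idea follows; the function name, image token id, and toy shapes are illustrative rather than the exact vLLM helper.

# Rough sketch of merging vision embeddings into the text embedding sequence.
# Illustrative only; see _merge_vision_embeddings in vllm for the real code.
import torch


def merge_vision_embeddings(input_ids: torch.Tensor,
                            inputs_embeds: torch.Tensor,
                            vision_embeddings: torch.Tensor,
                            image_token_id: int) -> torch.Tensor:
    # Positions that were filled with the image placeholder token.
    mask = input_ids == image_token_id
    flat = vision_embeddings.reshape(-1, vision_embeddings.shape[-1])
    if flat.shape[0] != int(mask.sum()):
        raise ValueError(f"Got {flat.shape[0]} vision embeddings for "
                         f"{int(mask.sum())} image-token positions")
    merged = inputs_embeds.clone()
    merged[mask] = flat.to(inputs_embeds.dtype)
    return merged


# Toy example: a 6-token prompt whose middle 3 tokens are image placeholders.
ids = torch.tensor([1, 5, 32000, 32000, 32000, 7])
text_embeds = torch.zeros(6, 8)
vision_embeds = torch.randn(1, 3, 8)  # 3 "patch" embeddings for one image
out = merge_vision_embeddings(ids, text_embeds, vision_embeds,
                              image_token_id=32000)
assert out.shape == (6, 8)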