[doc] Fold long code blocks to improve readability (#19926)
Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com>
This commit is contained in:
@@ -25,59 +25,63 @@ Further update the model as follows:
|
||||
|
||||
- Implement [get_multimodal_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings] that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs.
|
||||
|
||||
```python
|
||||
class YourModelForImage2Seq(nn.Module):
|
||||
...
|
||||
??? Code
|
||||
|
||||
def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
|
||||
```python
|
||||
class YourModelForImage2Seq(nn.Module):
|
||||
...
|
||||
|
||||
assert self.vision_encoder is not None
|
||||
image_features = self.vision_encoder(image_input)
|
||||
return self.multi_modal_projector(image_features)
|
||||
def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
|
||||
|
||||
def get_multimodal_embeddings(
|
||||
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
|
||||
assert self.vision_encoder is not None
|
||||
image_features = self.vision_encoder(image_input)
|
||||
return self.multi_modal_projector(image_features)
|
||||
|
||||
# Validate the multimodal input keyword arguments
|
||||
image_input = self._parse_and_validate_image_input(**kwargs)
|
||||
if image_input is None:
|
||||
return None
|
||||
def get_multimodal_embeddings(
|
||||
self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
|
||||
|
||||
# Run multimodal inputs through encoder and projector
|
||||
vision_embeddings = self._process_image_input(image_input)
|
||||
return vision_embeddings
|
||||
```
|
||||
# Validate the multimodal input keyword arguments
|
||||
image_input = self._parse_and_validate_image_input(**kwargs)
|
||||
if image_input is None:
|
||||
return None
|
||||
|
||||
# Run multimodal inputs through encoder and projector
|
||||
vision_embeddings = self._process_image_input(image_input)
|
||||
return vision_embeddings
|
||||
```
|
||||
|
||||
!!! important
|
||||
The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g, image) of the request.
|
||||
|
||||
- Implement [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings] to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings.
|
||||
|
||||
```python
|
||||
from .utils import merge_multimodal_embeddings
|
||||
??? Code
|
||||
|
||||
class YourModelForImage2Seq(nn.Module):
|
||||
...
|
||||
```python
|
||||
from .utils import merge_multimodal_embeddings
|
||||
|
||||
def get_input_embeddings(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
|
||||
) -> torch.Tensor:
|
||||
class YourModelForImage2Seq(nn.Module):
|
||||
...
|
||||
|
||||
# `get_input_embeddings` should already be implemented for the language
|
||||
# model as one of the requirements of basic vLLM model implementation.
|
||||
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
|
||||
def get_input_embeddings(
|
||||
self,
|
||||
input_ids: torch.Tensor,
|
||||
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
|
||||
) -> torch.Tensor:
|
||||
|
||||
if multimodal_embeddings is not None:
|
||||
inputs_embeds = merge_multimodal_embeddings(
|
||||
input_ids=input_ids,
|
||||
inputs_embeds=inputs_embeds,
|
||||
multimodal_embeddings=multimodal_embeddings,
|
||||
placeholder_token_id=self.config.image_token_index)
|
||||
# `get_input_embeddings` should already be implemented for the language
|
||||
# model as one of the requirements of basic vLLM model implementation.
|
||||
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
|
||||
|
||||
return inputs_embeds
|
||||
```
|
||||
if multimodal_embeddings is not None:
|
||||
inputs_embeds = merge_multimodal_embeddings(
|
||||
input_ids=input_ids,
|
||||
inputs_embeds=inputs_embeds,
|
||||
multimodal_embeddings=multimodal_embeddings,
|
||||
placeholder_token_id=self.config.image_token_index)
|
||||
|
||||
return inputs_embeds
|
||||
```
|
||||
|
||||
- Implement [get_language_model][vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model] getter to provide stable access to the underlying language model.
|
||||
|
||||
@@ -135,42 +139,46 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
|
||||
|
||||
Looking at the code of HF's `LlavaForConditionalGeneration`:
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544
|
||||
n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
|
||||
n_image_features = image_features.shape[0] * image_features.shape[1]
|
||||
??? Code
|
||||
|
||||
if n_image_tokens != n_image_features:
|
||||
raise ValueError(
|
||||
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544
|
||||
n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
|
||||
n_image_features = image_features.shape[0] * image_features.shape[1]
|
||||
|
||||
if n_image_tokens != n_image_features:
|
||||
raise ValueError(
|
||||
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
|
||||
)
|
||||
special_image_mask = (
|
||||
(input_ids == self.config.image_token_index)
|
||||
.unsqueeze(-1)
|
||||
.expand_as(inputs_embeds)
|
||||
.to(inputs_embeds.device)
|
||||
)
|
||||
special_image_mask = (
|
||||
(input_ids == self.config.image_token_index)
|
||||
.unsqueeze(-1)
|
||||
.expand_as(inputs_embeds)
|
||||
.to(inputs_embeds.device)
|
||||
)
|
||||
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
||||
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
|
||||
```
|
||||
image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
||||
inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
|
||||
```
|
||||
|
||||
The number of placeholder feature tokens per image is `image_features.shape[1]`.
|
||||
`image_features` is calculated inside the `get_image_features` method:
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300
|
||||
image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
|
||||
??? Code
|
||||
|
||||
selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
|
||||
if vision_feature_select_strategy == "default":
|
||||
selected_image_feature = selected_image_feature[:, 1:]
|
||||
elif vision_feature_select_strategy == "full":
|
||||
selected_image_feature = selected_image_feature
|
||||
else:
|
||||
raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
|
||||
image_features = self.multi_modal_projector(selected_image_feature)
|
||||
return image_features
|
||||
```
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300
|
||||
image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
|
||||
|
||||
selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
|
||||
if vision_feature_select_strategy == "default":
|
||||
selected_image_feature = selected_image_feature[:, 1:]
|
||||
elif vision_feature_select_strategy == "full":
|
||||
selected_image_feature = selected_image_feature
|
||||
else:
|
||||
raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
|
||||
image_features = self.multi_modal_projector(selected_image_feature)
|
||||
return image_features
|
||||
```
|
||||
|
||||
We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower
|
||||
(`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model).
|
||||
@@ -193,20 +201,22 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
|
||||
|
||||
To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`:
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257
|
||||
target_dtype = self.patch_embedding.weight.dtype
|
||||
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
|
||||
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
|
||||
??? Code
|
||||
|
||||
class_embeds = self.class_embedding.expand(batch_size, 1, -1)
|
||||
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
|
||||
if interpolate_pos_encoding:
|
||||
embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
|
||||
else:
|
||||
embeddings = embeddings + self.position_embedding(self.position_ids)
|
||||
return embeddings
|
||||
```
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257
|
||||
target_dtype = self.patch_embedding.weight.dtype
|
||||
patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
|
||||
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
|
||||
|
||||
class_embeds = self.class_embedding.expand(batch_size, 1, -1)
|
||||
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
|
||||
if interpolate_pos_encoding:
|
||||
embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
|
||||
else:
|
||||
embeddings = embeddings + self.position_embedding(self.position_ids)
|
||||
return embeddings
|
||||
```
|
||||
|
||||
We can infer that `embeddings.shape[1] == self.num_positions`, where
|
||||
|
||||
@@ -218,55 +228,59 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
|
||||
|
||||
Overall, the number of placeholder feature tokens for an image can be calculated as:
|
||||
|
||||
```python
|
||||
def get_num_image_tokens(
|
||||
self,
|
||||
*,
|
||||
image_width: int,
|
||||
image_height: int,
|
||||
) -> int:
|
||||
hf_config = self.get_hf_config()
|
||||
hf_processor = self.get_hf_processor()
|
||||
??? Code
|
||||
|
||||
image_size = hf_config.vision_config.image_size
|
||||
patch_size = hf_config.vision_config.patch_size
|
||||
```python
|
||||
def get_num_image_tokens(
|
||||
self,
|
||||
*,
|
||||
image_width: int,
|
||||
image_height: int,
|
||||
) -> int:
|
||||
hf_config = self.get_hf_config()
|
||||
hf_processor = self.get_hf_processor()
|
||||
|
||||
num_image_tokens = (image_size // patch_size) ** 2 + 1
|
||||
if hf_processor.vision_feature_select_strategy == "default":
|
||||
num_image_tokens -= 1
|
||||
image_size = hf_config.vision_config.image_size
|
||||
patch_size = hf_config.vision_config.patch_size
|
||||
|
||||
return num_image_tokens
|
||||
```
|
||||
num_image_tokens = (image_size // patch_size) ** 2 + 1
|
||||
if hf_processor.vision_feature_select_strategy == "default":
|
||||
num_image_tokens -= 1
|
||||
|
||||
return num_image_tokens
|
||||
```
|
||||
|
||||
Notice that the number of image tokens doesn't depend on the image width and height.
|
||||
We can simply use a dummy `image_size` to calculate the multimodal profiling data:
|
||||
|
||||
```python
|
||||
# NOTE: In actuality, this is usually implemented as part of the
|
||||
# model's subclass of `BaseProcessingInfo`, but we show it as is
|
||||
# here for simplicity.
|
||||
def get_image_size_with_most_features(self) -> ImageSize:
|
||||
hf_config = self.get_hf_config()
|
||||
width = height = hf_config.image_size
|
||||
return ImageSize(width=width, height=height)
|
||||
??? Code
|
||||
|
||||
def get_dummy_mm_data(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
```python
|
||||
# NOTE: In actuality, this is usually implemented as part of the
|
||||
# model's subclass of `BaseProcessingInfo`, but we show it as is
|
||||
# here for simplicity.
|
||||
def get_image_size_with_most_features(self) -> ImageSize:
|
||||
hf_config = self.get_hf_config()
|
||||
width = height = hf_config.image_size
|
||||
return ImageSize(width=width, height=height)
|
||||
|
||||
target_width, target_height = \
|
||||
self.info.get_image_size_with_most_features()
|
||||
def get_dummy_mm_data(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> MultiModalDataDict:
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
return {
|
||||
"image":
|
||||
self._get_dummy_images(width=target_width,
|
||||
height=target_height,
|
||||
num_images=num_images)
|
||||
}
|
||||
```
|
||||
target_width, target_height = \
|
||||
self.info.get_image_size_with_most_features()
|
||||
|
||||
return {
|
||||
"image":
|
||||
self._get_dummy_images(width=target_width,
|
||||
height=target_height,
|
||||
num_images=num_images)
|
||||
}
|
||||
```
|
||||
|
||||
For the text, we simply expand the multimodal image token from the model config to match the desired number of images.
|
||||
|
||||
@@ -284,21 +298,23 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
|
||||
|
||||
Looking at the code of HF's `FuyuForCausalLM`:
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/modeling_fuyu.py#L311-L322
|
||||
if image_patches is not None and past_key_values is None:
|
||||
patch_embeddings = [
|
||||
self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype))
|
||||
.squeeze(0)
|
||||
.to(inputs_embeds.device)
|
||||
for patch in image_patches
|
||||
]
|
||||
inputs_embeds = self.gather_continuous_embeddings(
|
||||
word_embeddings=inputs_embeds,
|
||||
continuous_embeddings=patch_embeddings,
|
||||
image_patch_input_indices=image_patches_indices,
|
||||
)
|
||||
```
|
||||
??? Code
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/modeling_fuyu.py#L311-L322
|
||||
if image_patches is not None and past_key_values is None:
|
||||
patch_embeddings = [
|
||||
self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype))
|
||||
.squeeze(0)
|
||||
.to(inputs_embeds.device)
|
||||
for patch in image_patches
|
||||
]
|
||||
inputs_embeds = self.gather_continuous_embeddings(
|
||||
word_embeddings=inputs_embeds,
|
||||
continuous_embeddings=patch_embeddings,
|
||||
image_patch_input_indices=image_patches_indices,
|
||||
)
|
||||
```
|
||||
|
||||
The number of placeholder feature tokens for the `i`th item in the batch is `patch_embeddings[i].shape[0]`,
|
||||
which is the same as `image_patches[i].shape[0]`, i.e. `num_total_patches`.
|
||||
@@ -312,92 +328,98 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
|
||||
In `FuyuImageProcessor.preprocess`, the images are resized and padded to the target `FuyuImageProcessor.size`,
|
||||
returning the dimensions after resizing (but before padding) as metadata.
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L541-L544
|
||||
image_encoding = self.image_processor.preprocess(images, **output_kwargs["images_kwargs"])
|
||||
batch_images = image_encoding["images"]
|
||||
image_unpadded_heights = image_encoding["image_unpadded_heights"]
|
||||
image_unpadded_widths = image_encoding["image_unpadded_widths"]
|
||||
??? Code
|
||||
|
||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L480-L
|
||||
if do_resize:
|
||||
batch_images = [
|
||||
[self.resize(image, size=size, input_data_format=input_data_format) for image in images]
|
||||
for images in batch_images
|
||||
]
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L541-L544
|
||||
image_encoding = self.image_processor.preprocess(images, **output_kwargs["images_kwargs"])
|
||||
batch_images = image_encoding["images"]
|
||||
image_unpadded_heights = image_encoding["image_unpadded_heights"]
|
||||
image_unpadded_widths = image_encoding["image_unpadded_widths"]
|
||||
|
||||
image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images]
|
||||
image_unpadded_heights = [[image_size[0]] for image_size in image_sizes]
|
||||
image_unpadded_widths = [[image_size[1]] for image_size in image_sizes]
|
||||
|
||||
if do_pad:
|
||||
batch_images = [
|
||||
[
|
||||
self.pad_image(
|
||||
image,
|
||||
size=size,
|
||||
mode=padding_mode,
|
||||
constant_values=padding_value,
|
||||
input_data_format=input_data_format,
|
||||
)
|
||||
for image in images
|
||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L480-L
|
||||
if do_resize:
|
||||
batch_images = [
|
||||
[self.resize(image, size=size, input_data_format=input_data_format) for image in images]
|
||||
for images in batch_images
|
||||
]
|
||||
for images in batch_images
|
||||
]
|
||||
```
|
||||
|
||||
image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images]
|
||||
image_unpadded_heights = [[image_size[0]] for image_size in image_sizes]
|
||||
image_unpadded_widths = [[image_size[1]] for image_size in image_sizes]
|
||||
|
||||
if do_pad:
|
||||
batch_images = [
|
||||
[
|
||||
self.pad_image(
|
||||
image,
|
||||
size=size,
|
||||
mode=padding_mode,
|
||||
constant_values=padding_value,
|
||||
input_data_format=input_data_format,
|
||||
)
|
||||
for image in images
|
||||
]
|
||||
for images in batch_images
|
||||
]
|
||||
```
|
||||
|
||||
In `FuyuImageProcessor.preprocess_with_tokenizer_info`, the images are split into patches based on this metadata:
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L425
|
||||
model_image_input = self.image_processor.preprocess_with_tokenizer_info(
|
||||
image_input=tensor_batch_images,
|
||||
image_present=image_present,
|
||||
image_unpadded_h=image_unpadded_heights,
|
||||
image_unpadded_w=image_unpadded_widths,
|
||||
image_placeholder_id=image_placeholder_id,
|
||||
image_newline_id=image_newline_id,
|
||||
variable_sized=True,
|
||||
)
|
||||
??? Code
|
||||
|
||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L638-L658
|
||||
image_height, image_width = image.shape[1], image.shape[2]
|
||||
if variable_sized: # variable_sized=True
|
||||
new_h = min(
|
||||
image_height,
|
||||
math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height,
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L425
|
||||
model_image_input = self.image_processor.preprocess_with_tokenizer_info(
|
||||
image_input=tensor_batch_images,
|
||||
image_present=image_present,
|
||||
image_unpadded_h=image_unpadded_heights,
|
||||
image_unpadded_w=image_unpadded_widths,
|
||||
image_placeholder_id=image_placeholder_id,
|
||||
image_newline_id=image_newline_id,
|
||||
variable_sized=True,
|
||||
)
|
||||
new_w = min(
|
||||
image_width,
|
||||
math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width,
|
||||
)
|
||||
image = image[:, :new_h, :new_w]
|
||||
image_height, image_width = new_h, new_w
|
||||
|
||||
num_patches = self.get_num_patches(image_height=image_height, image_width=image_width)
|
||||
tensor_of_image_ids = torch.full(
|
||||
[num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device
|
||||
)
|
||||
patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0)
|
||||
assert num_patches == patches.shape[0]
|
||||
```
|
||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L638-L658
|
||||
image_height, image_width = image.shape[1], image.shape[2]
|
||||
if variable_sized: # variable_sized=True
|
||||
new_h = min(
|
||||
image_height,
|
||||
math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height,
|
||||
)
|
||||
new_w = min(
|
||||
image_width,
|
||||
math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width,
|
||||
)
|
||||
image = image[:, :new_h, :new_w]
|
||||
image_height, image_width = new_h, new_w
|
||||
|
||||
num_patches = self.get_num_patches(image_height=image_height, image_width=image_width)
|
||||
tensor_of_image_ids = torch.full(
|
||||
[num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device
|
||||
)
|
||||
patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0)
|
||||
assert num_patches == patches.shape[0]
|
||||
```
|
||||
|
||||
The number of patches is in turn defined by `FuyuImageProcessor.get_num_patches`:
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L552-L562
|
||||
patch_size = patch_size if patch_size is not None else self.patch_size
|
||||
patch_height, patch_width = self.patch_size["height"], self.patch_size["width"]
|
||||
??? Code
|
||||
|
||||
if image_height % patch_height != 0:
|
||||
raise ValueError(f"{image_height=} must be divisible by {patch_height}")
|
||||
if image_width % patch_width != 0:
|
||||
raise ValueError(f"{image_width=} must be divisible by {patch_width}")
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L552-L562
|
||||
patch_size = patch_size if patch_size is not None else self.patch_size
|
||||
patch_height, patch_width = self.patch_size["height"], self.patch_size["width"]
|
||||
|
||||
num_patches_per_dim_h = image_height // patch_height
|
||||
num_patches_per_dim_w = image_width // patch_width
|
||||
num_patches = num_patches_per_dim_h * num_patches_per_dim_w
|
||||
```
|
||||
if image_height % patch_height != 0:
|
||||
raise ValueError(f"{image_height=} must be divisible by {patch_height}")
|
||||
if image_width % patch_width != 0:
|
||||
raise ValueError(f"{image_width=} must be divisible by {patch_width}")
|
||||
|
||||
num_patches_per_dim_h = image_height // patch_height
|
||||
num_patches_per_dim_w = image_width // patch_width
|
||||
num_patches = num_patches_per_dim_h * num_patches_per_dim_w
|
||||
```
|
||||
|
||||
These image patches correspond to placeholder tokens (`|SPEAKER|`). So, we just need to maximize the number of image patches. Since input images are first resized
|
||||
to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`.
|
||||
@@ -419,23 +441,25 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
|
||||
|
||||
For the multimodal image profiling data, the logic is very similar to LLaVA:
|
||||
|
||||
```python
|
||||
def get_dummy_mm_data(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> MultiModalDataDict:
|
||||
target_width, target_height = \
|
||||
self.info.get_image_size_with_most_features()
|
||||
num_images = mm_counts.get("image", 0)
|
||||
??? Code
|
||||
|
||||
return {
|
||||
"image":
|
||||
self._get_dummy_images(width=target_width,
|
||||
height=target_height,
|
||||
num_images=num_images)
|
||||
}
|
||||
```
|
||||
```python
|
||||
def get_dummy_mm_data(
|
||||
self,
|
||||
seq_len: int,
|
||||
mm_counts: Mapping[str, int],
|
||||
) -> MultiModalDataDict:
|
||||
target_width, target_height = \
|
||||
self.info.get_image_size_with_most_features()
|
||||
num_images = mm_counts.get("image", 0)
|
||||
|
||||
return {
|
||||
"image":
|
||||
self._get_dummy_images(width=target_width,
|
||||
height=target_height,
|
||||
num_images=num_images)
|
||||
}
|
||||
```
|
||||
|
||||
## 4. Specify processing details
|
||||
|
||||
@@ -455,6 +479,7 @@ return a schema of the tensors outputted by the HF processor that are related to
|
||||
The output of `CLIPImageProcessor` is a simple tensor with shape
|
||||
`(num_images, num_channels, image_height, image_width)`:
|
||||
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/image_processing_clip.py#L339-L345
|
||||
images = [
|
||||
@@ -505,35 +530,37 @@ return a schema of the tensors outputted by the HF processor that are related to
|
||||
In order to support the use of [MultiModalFieldConfig.batched][] like in LLaVA,
|
||||
we remove the extra batch dimension by overriding [BaseMultiModalProcessor._call_hf_processor][]:
|
||||
|
||||
```python
|
||||
def _call_hf_processor(
|
||||
self,
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
processed_outputs = super()._call_hf_processor(
|
||||
prompt=prompt,
|
||||
mm_data=mm_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
)
|
||||
??? Code
|
||||
|
||||
image_patches = processed_outputs.get("image_patches")
|
||||
if image_patches is not None:
|
||||
images = mm_data["images"]
|
||||
assert isinstance(images, list)
|
||||
```python
|
||||
def _call_hf_processor(
|
||||
self,
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
processed_outputs = super()._call_hf_processor(
|
||||
prompt=prompt,
|
||||
mm_data=mm_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
)
|
||||
|
||||
# Original output: (1, num_images, Pn, Px * Py * C)
|
||||
# New output: (num_images, Pn, Px * Py * C)
|
||||
assert (isinstance(image_patches, list)
|
||||
and len(image_patches) == 1)
|
||||
assert (isinstance(image_patches[0], torch.Tensor)
|
||||
and len(image_patches[0]) == len(images))
|
||||
image_patches = processed_outputs.get("image_patches")
|
||||
if image_patches is not None:
|
||||
images = mm_data["images"]
|
||||
assert isinstance(images, list)
|
||||
|
||||
processed_outputs["image_patches"] = image_patches[0]
|
||||
# Original output: (1, num_images, Pn, Px * Py * C)
|
||||
# New output: (num_images, Pn, Px * Py * C)
|
||||
assert (isinstance(image_patches, list)
|
||||
and len(image_patches) == 1)
|
||||
assert (isinstance(image_patches[0], torch.Tensor)
|
||||
and len(image_patches[0]) == len(images))
|
||||
|
||||
return processed_outputs
|
||||
```
|
||||
processed_outputs["image_patches"] = image_patches[0]
|
||||
|
||||
return processed_outputs
|
||||
```
|
||||
|
||||
!!! note
|
||||
Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling
|
||||
@@ -573,35 +600,37 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
|
||||
It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`).
|
||||
Based on this, we override [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] as follows:
|
||||
|
||||
```python
|
||||
def _get_prompt_updates(
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
out_mm_kwargs: MultiModalKwargs,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
hf_config = self.info.get_hf_config()
|
||||
image_token_id = hf_config.image_token_index
|
||||
??? Code
|
||||
|
||||
def get_replacement(item_idx: int):
|
||||
images = mm_items.get_items("image", ImageProcessorItems)
|
||||
```python
|
||||
def _get_prompt_updates(
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
out_mm_kwargs: MultiModalKwargs,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
hf_config = self.info.get_hf_config()
|
||||
image_token_id = hf_config.image_token_index
|
||||
|
||||
image_size = images.get_image_size(item_idx)
|
||||
num_image_tokens = self.info.get_num_image_tokens(
|
||||
image_width=image_size.width,
|
||||
image_height=image_size.height,
|
||||
)
|
||||
def get_replacement(item_idx: int):
|
||||
images = mm_items.get_items("image", ImageProcessorItems)
|
||||
|
||||
return [image_token_id] * num_image_tokens
|
||||
image_size = images.get_image_size(item_idx)
|
||||
num_image_tokens = self.info.get_num_image_tokens(
|
||||
image_width=image_size.width,
|
||||
image_height=image_size.height,
|
||||
)
|
||||
|
||||
return [
|
||||
PromptReplacement(
|
||||
modality="image",
|
||||
target=[image_token_id],
|
||||
replacement=get_replacement,
|
||||
),
|
||||
]
|
||||
```
|
||||
return [image_token_id] * num_image_tokens
|
||||
|
||||
return [
|
||||
PromptReplacement(
|
||||
modality="image",
|
||||
target=[image_token_id],
|
||||
replacement=get_replacement,
|
||||
),
|
||||
]
|
||||
```
|
||||
|
||||
=== "Handling additional tokens: Fuyu"
|
||||
|
||||
@@ -616,117 +645,90 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
|
||||
|
||||
We define a helper function to return `ncols` and `nrows` directly:
|
||||
|
||||
```python
|
||||
def get_image_feature_grid_size(
|
||||
self,
|
||||
*,
|
||||
image_width: int,
|
||||
image_height: int,
|
||||
) -> tuple[int, int]:
|
||||
image_processor = self.get_image_processor()
|
||||
target_width = image_processor.size["width"]
|
||||
target_height = image_processor.size["height"]
|
||||
patch_width = image_processor.patch_size["width"]
|
||||
patch_height = image_processor.patch_size["height"]
|
||||
??? Code
|
||||
|
||||
if not (image_width <= target_width and image_height <= target_height):
|
||||
height_scale_factor = target_height / image_height
|
||||
width_scale_factor = target_width / image_width
|
||||
optimal_scale_factor = min(height_scale_factor, width_scale_factor)
|
||||
```python
|
||||
def get_image_feature_grid_size(
|
||||
self,
|
||||
*,
|
||||
image_width: int,
|
||||
image_height: int,
|
||||
) -> tuple[int, int]:
|
||||
image_processor = self.get_image_processor()
|
||||
target_width = image_processor.size["width"]
|
||||
target_height = image_processor.size["height"]
|
||||
patch_width = image_processor.patch_size["width"]
|
||||
patch_height = image_processor.patch_size["height"]
|
||||
|
||||
image_height = int(image_height * optimal_scale_factor)
|
||||
image_width = int(image_width * optimal_scale_factor)
|
||||
if not (image_width <= target_width and image_height <= target_height):
|
||||
height_scale_factor = target_height / image_height
|
||||
width_scale_factor = target_width / image_width
|
||||
optimal_scale_factor = min(height_scale_factor, width_scale_factor)
|
||||
|
||||
ncols = math.ceil(image_width / patch_width)
|
||||
nrows = math.ceil(image_height / patch_height)
|
||||
return ncols, nrows
|
||||
```
|
||||
image_height = int(image_height * optimal_scale_factor)
|
||||
image_width = int(image_width * optimal_scale_factor)
|
||||
|
||||
ncols = math.ceil(image_width / patch_width)
|
||||
nrows = math.ceil(image_height / patch_height)
|
||||
return ncols, nrows
|
||||
```
|
||||
|
||||
Based on this, we can initially define our replacement tokens as:
|
||||
|
||||
```python
|
||||
def get_replacement(item_idx: int):
|
||||
images = mm_items.get_items("image", ImageProcessorItems)
|
||||
image_size = images.get_image_size(item_idx)
|
||||
??? Code
|
||||
|
||||
ncols, nrows = self.info.get_image_feature_grid_size(
|
||||
image_width=image_size.width,
|
||||
image_height=image_size.height,
|
||||
)
|
||||
```python
|
||||
def get_replacement(item_idx: int):
|
||||
images = mm_items.get_items("image", ImageProcessorItems)
|
||||
image_size = images.get_image_size(item_idx)
|
||||
|
||||
# `_IMAGE_TOKEN_ID` corresponds to `|SPEAKER|`
|
||||
# `_NEWLINE_TOKEN_ID` corresponds to `|NEWLINE|`
|
||||
return ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
|
||||
```
|
||||
ncols, nrows = self.info.get_image_feature_grid_size(
|
||||
image_width=image_size.width,
|
||||
image_height=image_size.height,
|
||||
)
|
||||
|
||||
# `_IMAGE_TOKEN_ID` corresponds to `|SPEAKER|`
|
||||
# `_NEWLINE_TOKEN_ID` corresponds to `|NEWLINE|`
|
||||
return ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
|
||||
```
|
||||
|
||||
However, this is not entirely correct. After `FuyuImageProcessor.preprocess_with_tokenizer_info` is called,
|
||||
a BOS token (`<s>`) is also added to the promopt:
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L435
|
||||
model_image_input = self.image_processor.preprocess_with_tokenizer_info(
|
||||
image_input=tensor_batch_images,
|
||||
image_present=image_present,
|
||||
image_unpadded_h=image_unpadded_heights,
|
||||
image_unpadded_w=image_unpadded_widths,
|
||||
image_placeholder_id=image_placeholder_id,
|
||||
image_newline_id=image_newline_id,
|
||||
variable_sized=True,
|
||||
)
|
||||
prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
|
||||
tokenizer=self.tokenizer,
|
||||
prompts=prompts,
|
||||
scale_factors=scale_factors,
|
||||
max_tokens_to_generate=self.max_tokens_to_generate,
|
||||
max_position_embeddings=self.max_position_embeddings,
|
||||
add_BOS=True,
|
||||
add_beginning_of_answer_token=True,
|
||||
)
|
||||
```
|
||||
??? Code
|
||||
|
||||
```python
|
||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L435
|
||||
model_image_input = self.image_processor.preprocess_with_tokenizer_info(
|
||||
image_input=tensor_batch_images,
|
||||
image_present=image_present,
|
||||
image_unpadded_h=image_unpadded_heights,
|
||||
image_unpadded_w=image_unpadded_widths,
|
||||
image_placeholder_id=image_placeholder_id,
|
||||
image_newline_id=image_newline_id,
|
||||
variable_sized=True,
|
||||
)
|
||||
prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
|
||||
tokenizer=self.tokenizer,
|
||||
prompts=prompts,
|
||||
scale_factors=scale_factors,
|
||||
max_tokens_to_generate=self.max_tokens_to_generate,
|
||||
max_position_embeddings=self.max_position_embeddings,
|
||||
add_BOS=True,
|
||||
add_beginning_of_answer_token=True,
|
||||
)
|
||||
```
|
||||
|
||||
To assign the vision embeddings to only the image tokens, instead of a string
|
||||
you can return an instance of [PromptUpdateDetails][vllm.multimodal.processing.PromptUpdateDetails]:
|
||||
|
||||
```python
|
||||
hf_config = self.info.get_hf_config()
|
||||
bos_token_id = hf_config.bos_token_id # `<s>`
|
||||
assert isinstance(bos_token_id, int)
|
||||
??? Code
|
||||
|
||||
def get_replacement_fuyu(item_idx: int):
|
||||
images = mm_items.get_items("image", ImageProcessorItems)
|
||||
image_size = images.get_image_size(item_idx)
|
||||
|
||||
ncols, nrows = self.info.get_image_feature_grid_size(
|
||||
image_width=image_size.width,
|
||||
image_height=image_size.height,
|
||||
)
|
||||
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
|
||||
[_NEWLINE_TOKEN_ID]) * nrows
|
||||
|
||||
return PromptUpdateDetails.select_token_id(
|
||||
image_tokens + [bos_token_id],
|
||||
embed_token_id=_IMAGE_TOKEN_ID,
|
||||
)
|
||||
```
|
||||
|
||||
Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the tokenized prompt,
|
||||
we can search for it to conduct the replacement at the start of the string:
|
||||
|
||||
```python
|
||||
def _get_prompt_updates(
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
out_mm_kwargs: MultiModalKwargs,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
```python
|
||||
hf_config = self.info.get_hf_config()
|
||||
bos_token_id = hf_config.bos_token_id
|
||||
bos_token_id = hf_config.bos_token_id # `<s>`
|
||||
assert isinstance(bos_token_id, int)
|
||||
|
||||
tokenizer = self.info.get_tokenizer()
|
||||
eot_token_id = tokenizer.bos_token_id
|
||||
assert isinstance(eot_token_id, int)
|
||||
|
||||
def get_replacement_fuyu(item_idx: int):
|
||||
images = mm_items.get_items("image", ImageProcessorItems)
|
||||
image_size = images.get_image_size(item_idx)
|
||||
@@ -742,15 +744,52 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
|
||||
image_tokens + [bos_token_id],
|
||||
embed_token_id=_IMAGE_TOKEN_ID,
|
||||
)
|
||||
```
|
||||
|
||||
return [
|
||||
PromptReplacement(
|
||||
modality="image",
|
||||
target=[eot_token_id],
|
||||
replacement=get_replacement_fuyu,
|
||||
)
|
||||
]
|
||||
```
|
||||
Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the tokenized prompt,
|
||||
we can search for it to conduct the replacement at the start of the string:
|
||||
|
||||
??? Code
|
||||
|
||||
```python
|
||||
def _get_prompt_updates(
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
out_mm_kwargs: MultiModalKwargs,
|
||||
) -> Sequence[PromptUpdate]:
|
||||
hf_config = self.info.get_hf_config()
|
||||
bos_token_id = hf_config.bos_token_id
|
||||
assert isinstance(bos_token_id, int)
|
||||
|
||||
tokenizer = self.info.get_tokenizer()
|
||||
eot_token_id = tokenizer.bos_token_id
|
||||
assert isinstance(eot_token_id, int)
|
||||
|
||||
def get_replacement_fuyu(item_idx: int):
|
||||
images = mm_items.get_items("image", ImageProcessorItems)
|
||||
image_size = images.get_image_size(item_idx)
|
||||
|
||||
ncols, nrows = self.info.get_image_feature_grid_size(
|
||||
image_width=image_size.width,
|
||||
image_height=image_size.height,
|
||||
)
|
||||
image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
|
||||
[_NEWLINE_TOKEN_ID]) * nrows
|
||||
|
||||
return PromptUpdateDetails.select_token_id(
|
||||
image_tokens + [bos_token_id],
|
||||
embed_token_id=_IMAGE_TOKEN_ID,
|
||||
)
|
||||
|
||||
return [
|
||||
PromptReplacement(
|
||||
modality="image",
|
||||
target=[eot_token_id],
|
||||
replacement=get_replacement_fuyu,
|
||||
)
|
||||
]
|
||||
```
|
||||
|
||||
## 5. Register processor-related classes
|
||||
|
||||
|
||||
Reference in New Issue
Block a user