[VLM] Merged multi-modal processor for LLaVA-NeXT (#11682)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -38,6 +38,7 @@ from vllm.sequence import IntermediateTensors, SequenceData
|
||||
from .interfaces import SupportsMultiModal, SupportsPP
|
||||
from .utils import (init_vllm_registered_model, maybe_prefix,
|
||||
merge_multimodal_embeddings)
|
||||
from .vision import VisionEncoderInfo
|
||||
|
||||
try:
|
||||
from xformers import ops as xops
|
||||
@@ -697,10 +698,18 @@ def get_pixtral_hf_patch_grid_length(*, image_size: int,
|
||||
return image_size // patch_size
|
||||
|
||||
|
||||
def get_pixtral_hf_num_patches(*, image_size: int, patch_size: int) -> int:
|
||||
grid_length = get_pixtral_hf_patch_grid_length(image_size=image_size,
|
||||
patch_size=patch_size)
|
||||
return grid_length * grid_length
|
||||
def get_pixtral_hf_image_feature_size(
|
||||
*,
|
||||
image_size: int,
|
||||
patch_size: int,
|
||||
) -> int:
|
||||
grid_length = get_pixtral_hf_patch_grid_length(
|
||||
image_size=image_size,
|
||||
patch_size=patch_size,
|
||||
)
|
||||
|
||||
# Consider the image_break_token
|
||||
return (grid_length + 1) * grid_length
|
||||
|
||||
|
||||
def get_max_pixtral_hf_image_tokens(hf_config: PixtralVisionConfig) -> int:
|
||||
@@ -730,13 +739,16 @@ def dummy_image_for_pixtral_hf(
|
||||
return {"image": image if num_images == 1 else [image] * num_images}
|
||||
|
||||
|
||||
def get_pixtral_hf_image_feature_size(hf_config: PixtralVisionConfig,
|
||||
image_width: int,
|
||||
image_height: int) -> Tuple[int, int]:
|
||||
# Adapted from transformers.models.pixtral.image_processing_pixtral.get_resize_output_image_size # noqa: E501
|
||||
# https://github.com/huggingface/transformers/blob/2bd4d5897dc73e8b172832070a6f9e567a0df017/src/transformers/models/pixtral/image_processing_pixtral.py#L180 # noqa: E501
|
||||
max_width, max_height = hf_config.image_size, hf_config.image_size
|
||||
patch_width, patch_height = hf_config.patch_size, hf_config.patch_size
|
||||
# Adapted from transformers.models.pixtral.image_processing_pixtral.get_resize_output_image_size # noqa: E501
|
||||
# https://github.com/huggingface/transformers/blob/2bd4d5897dc73e8b172832070a6f9e567a0df017/src/transformers/models/pixtral/image_processing_pixtral.py#L180
|
||||
def get_pixtral_hf_image_feature_grid_size(
|
||||
hf_config: PixtralVisionConfig,
|
||||
*,
|
||||
image_width: int,
|
||||
image_height: int,
|
||||
) -> tuple[int, int]:
|
||||
max_width = max_height = hf_config.image_size
|
||||
patch_width = patch_height = hf_config.patch_size
|
||||
|
||||
ratio = max(image_width / max_width, image_height / max_height)
|
||||
|
||||
@@ -744,12 +756,38 @@ def get_pixtral_hf_image_feature_size(hf_config: PixtralVisionConfig,
|
||||
image_width = int(math.ceil(image_width / ratio))
|
||||
image_height = int(math.ceil(image_height / ratio))
|
||||
|
||||
num_height_tokens, num_width_tokens = _get_pixtral_hf_num_image_tokens(
|
||||
nrows, ncols = _get_pixtral_hf_num_image_tokens(
|
||||
(image_height, image_width),
|
||||
(patch_height, patch_width),
|
||||
)
|
||||
) # type: ignore
|
||||
|
||||
return num_width_tokens, num_height_tokens
|
||||
return ncols, nrows
|
||||
|
||||
|
||||
class PixtralHFEncoderInfo(VisionEncoderInfo[PixtralVisionConfig]):
|
||||
|
||||
def get_num_image_tokens(
|
||||
self,
|
||||
*,
|
||||
image_width: int,
|
||||
image_height: int,
|
||||
) -> int:
|
||||
return get_pixtral_hf_image_feature_size(
|
||||
image_size=self.vision_config.image_size,
|
||||
patch_size=self.get_image_size(),
|
||||
)
|
||||
|
||||
def get_max_image_tokens(self) -> int:
|
||||
return get_max_pixtral_hf_image_tokens(self.vision_config)
|
||||
|
||||
def get_num_patches(self) -> int:
|
||||
return get_pixtral_hf_patch_grid_length(
|
||||
image_size=self.vision_config.image_size,
|
||||
patch_size=self.vision_config.patch_size,
|
||||
)
|
||||
|
||||
def get_image_size(self) -> int:
|
||||
return self.vision_config.image_size
|
||||
|
||||
|
||||
class PixtralHFMLP(nn.Module):
|
||||
|
||||
Reference in New Issue
Block a user