[VLM] Merged multi-modal processor for LLaVA-NeXT (#11682)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Cyrus Leung
2025-01-03 00:39:27 +08:00
committed by GitHub
parent b6087a6bee
commit 8c38ee7007
14 changed files with 609 additions and 555 deletions

@@ -28,6 +28,8 @@ from vllm.multimodal.utils import (cached_get_tokenizer,
                                    resolve_visual_encoder_outputs)
from vllm.sequence import SequenceData
from .vision import VisionEncoderInfo
def get_siglip_patch_grid_length(*, image_size: int, patch_size: int) -> int:
    # Since interpolation is applied, the image size need not be divisible
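
For reference, a minimal sketch of the quantity this helper computes, assuming the grid length is the floor division of image size by patch size (the function body lies outside this hunk, so the exact implementation is an assumption):

    # Illustrative sketch only, not part of this diff. Assumes the grid length
    # is image_size // patch_size; any remainder is absorbed by interpolation,
    # per the comment above.
    image_size, patch_size = 384, 14        # SigLIP-so400m style config
    grid_length = image_size // patch_size  # 27 patches per side
    num_patches = grid_length**2            # 729 patches per image
    print(grid_length, num_patches)
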
@@ -156,6 +158,29 @@ def input_processor_for_siglip(
                        multi_modal_placeholders={"image": ranges})
class SiglipEncoderInfo(VisionEncoderInfo[SiglipVisionConfig]):

    def get_num_image_tokens(
        self,
        *,
        image_width: int,
        image_height: int,
    ) -> int:
        return get_siglip_image_feature_size(self.vision_config)

    def get_max_image_tokens(self) -> int:
        return get_max_siglip_image_tokens(self.vision_config)

    def get_num_patches(self) -> int:
        return get_siglip_patch_grid_length(
            image_size=self.vision_config.image_size,
            patch_size=self.vision_config.patch_size,
        )

    def get_image_size(self) -> int:
        return self.vision_config.image_size
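
A hypothetical usage sketch of the new encoder-info interface (not part of this diff). It assumes the VisionEncoderInfo base class takes the HF vision config as its only constructor argument and stores it as self.vision_config, and that the module path below is where the class lives:

    # Hypothetical example: exercising SiglipEncoderInfo against a SigLIP
    # vision config. Constructor behaviour and import path are assumptions.
    from transformers import SiglipVisionConfig

    from vllm.model_executor.models.siglip import SiglipEncoderInfo

    config = SiglipVisionConfig(image_size=384, patch_size=14)
    info = SiglipEncoderInfo(config)

    # SigLIP yields a fixed number of image tokens per image, so the requested
    # width/height do not change the result here.
    print(info.get_num_image_tokens(image_width=1024, image_height=768))
    print(info.get_max_image_tokens())
    print(info.get_num_patches())  # 384 // 14 == 27
    print(info.get_image_size())   # 384
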
# Adapted from https://github.com/huggingface/transformers/blob/v4.43.3/src/transformers/models/siglip/modeling_siglip.py#L249 # noqa
class SiglipVisionEmbeddings(nn.Module):