[VLM] Merged multi-modal processors for LLaVA-NeXT-Video and LLaVA-OneVision (#11717)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-01-04 19:40:53 +08:00
committed by GitHub
parent 300acb8347
commit eed11ebee9
31 changed files with 1104 additions and 973 deletions

View File

@@ -67,9 +67,6 @@ class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor):
def _get_hf_processor(self) -> LlavaNextProcessor:
return self.ctx.get_hf_processor(LlavaNextProcessor)
def _get_image_token(self) -> str:
return self._get_hf_processor().image_token
def _get_mm_fields_config(
self,
hf_inputs: BatchFeature,
@@ -81,6 +78,9 @@ class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor):
image_embeds=MultiModalFieldConfig.batched("image"),
)
def _get_image_token(self) -> str:
return self._get_hf_processor().image_token
def _get_max_image_tokens(self) -> int:
largest_feature_size, _ = self._get_pinpoint_with_most_features()
return largest_feature_size
@@ -97,20 +97,20 @@ class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor):
image_height: int,
) -> int:
hf_config = self._get_hf_config()
vision_encoder_info = self._vision_encoder_info
base_feature_size = self._apply_feature_select_strategy(
hf_config.vision_feature_select_strategy,
self._vision_encoder_info.get_num_image_tokens(
vision_encoder_info.get_num_image_tokens(
image_width=image_width,
image_height=image_height,
),
)
num_patches = self._vision_encoder_info.get_num_patches()
num_patch_height, num_patch_width = get_anyres_image_grid_shape(
image_size=(image_height, image_width),
grid_pinpoints=hf_config.image_grid_pinpoints,
patch_size=self._vision_encoder_info.get_image_size(),
patch_size=vision_encoder_info.get_image_size(),
)
(
@@ -119,7 +119,7 @@ class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor):
) = self._get_num_unpadded_features(
original_height=image_height,
original_width=image_width,
npatches=num_patches,
npatches=vision_encoder_info.get_patch_grid_length(),
num_patch_height=num_patch_height,
num_patch_width=num_patch_width,
)
@@ -155,6 +155,7 @@ class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor):
unpadded_features = current_height * current_width
newline_features = current_height
return (unpadded_features, newline_features)
def _get_pinpoint_with_most_features(self) -> tuple[int, ImageSize]: