[VLM] Merged multi-modal processors for LLaVA-NeXT-Video and LLaVA-OneVision (#11717)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-04 19:40:53 +08:00
parent 300acb8347
commit eed11ebee9
31 changed files with 1104 additions and 973 deletions
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -67,9 +67,6 @@ class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor):
    def _get_hf_processor(self) -> LlavaNextProcessor:
        return self.ctx.get_hf_processor(LlavaNextProcessor)

-    def _get_image_token(self) -> str:
-        return self._get_hf_processor().image_token
-
    def _get_mm_fields_config(
        self,
        hf_inputs: BatchFeature,
@@ -81,6 +78,9 @@ class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor):
            image_embeds=MultiModalFieldConfig.batched("image"),
        )

+    def _get_image_token(self) -> str:
+        return self._get_hf_processor().image_token
+
    def _get_max_image_tokens(self) -> int:
        largest_feature_size, _ = self._get_pinpoint_with_most_features()
        return largest_feature_size
@@ -97,20 +97,20 @@ class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor):
        image_height: int,
    ) -> int:
        hf_config = self._get_hf_config()
+        vision_encoder_info = self._vision_encoder_info

        base_feature_size = self._apply_feature_select_strategy(
            hf_config.vision_feature_select_strategy,
-            self._vision_encoder_info.get_num_image_tokens(
+            vision_encoder_info.get_num_image_tokens(
                image_width=image_width,
                image_height=image_height,
            ),
        )
-        num_patches = self._vision_encoder_info.get_num_patches()

        num_patch_height, num_patch_width = get_anyres_image_grid_shape(
            image_size=(image_height, image_width),
            grid_pinpoints=hf_config.image_grid_pinpoints,
-            patch_size=self._vision_encoder_info.get_image_size(),
+            patch_size=vision_encoder_info.get_image_size(),
        )

        (
@@ -119,7 +119,7 @@ class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor):
        ) = self._get_num_unpadded_features(
            original_height=image_height,
            original_width=image_width,
-            npatches=num_patches,
+            npatches=vision_encoder_info.get_patch_grid_length(),
            num_patch_height=num_patch_height,
            num_patch_width=num_patch_width,
        )
@@ -155,6 +155,7 @@ class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor):

        unpadded_features = current_height * current_width
        newline_features = current_height
+
        return (unpadded_features, newline_features)

    def _get_pinpoint_with_most_features(self) -> tuple[int, ImageSize]: