[VLM] Merged multi-modal processor for LLaVA-NeXT (#11682)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Cyrus Leung
2025-01-03 00:39:27 +08:00
committed by GitHub
parent b6087a6bee
commit 8c38ee7007
14 changed files with 609 additions and 555 deletions


@@ -76,7 +76,7 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor):
         return ImageSize(width=target_size["width"],
                          height=target_size["height"])
 
-    def _get_image_grid_size(
+    def _get_image_feature_grid_size(
         self,
         *,
         image_width: int,
@@ -99,7 +99,7 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor):
     def get_mm_max_tokens_per_item(self) -> Mapping[str, int]:
         target_width, target_height = self._get_image_target_size()
 
-        max_ncols, max_nrows = self._get_image_grid_size(
+        max_ncols, max_nrows = self._get_image_feature_grid_size(
            image_width=target_width,
            image_height=target_height,
        )
@@ -172,7 +172,7 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor):
         images = mm_items.get_items("image", ImageProcessorItems)
         image_size = images.get_image_size(item_idx)
 
-        ncols, nrows = self._get_image_grid_size(
+        ncols, nrows = self._get_image_feature_grid_size(
            image_width=image_size.width,
            image_height=image_size.height,
        )
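For context, the renamed helper computes how many columns and rows of image-patch features Fuyu produces for a given image. Below is a minimal standalone sketch of that idea, assuming Fuyu's 30x30 patch size and a 1920x1080 target canvas; the constants and the function name image_feature_grid_size are illustrative, not the actual vLLM implementation.

import math

# Illustrative constants (assumption): Fuyu's image processor fits images
# inside a 1920x1080 canvas and splits them into 30x30 patches.
TARGET_WIDTH, TARGET_HEIGHT = 1920, 1080
PATCH_SIZE = 30

def image_feature_grid_size(image_width: int, image_height: int) -> tuple[int, int]:
    # Downscale (never upscale) so the image fits inside the target canvas,
    # preserving aspect ratio.
    if image_width > TARGET_WIDTH or image_height > TARGET_HEIGHT:
        scale = min(TARGET_WIDTH / image_width, TARGET_HEIGHT / image_height)
        image_width = int(image_width * scale)
        image_height = int(image_height * scale)

    # One image feature per 30x30 patch, rounding up at the edges.
    ncols = math.ceil(image_width / PATCH_SIZE)
    nrows = math.ceil(image_height / PATCH_SIZE)
    return ncols, nrows

# Example: a 1920x1080 input yields a 64x36 grid of image features.
print(image_feature_grid_size(1920, 1080))  # (64, 36)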