[Doc] [2/N] Add Fuyu E2E example for multimodal processor (#13331)

2025-02-15 23:06:23 +08:00
parent 54ed913f34
commit 367cb8ce8c
2 changed files with 527 additions and 29 deletions
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -104,6 +104,8 @@ class FuyuProcessingInfo(BaseProcessingInfo):
        image_processor = self.get_image_processor()
        target_width = image_processor.size["width"]
        target_height = image_processor.size["height"]
+        patch_width = image_processor.patch_size["width"]
+        patch_height = image_processor.patch_size["height"]

        if not (image_width <= target_width and image_height <= target_height):
            height_scale_factor = target_height / image_height
@@ -113,8 +115,8 @@ class FuyuProcessingInfo(BaseProcessingInfo):
            image_height = int(image_height * optimal_scale_factor)
            image_width = int(image_width * optimal_scale_factor)

-        ncols = math.ceil(image_width / 30)
-        nrows = math.ceil(image_height / 30)
+        ncols = math.ceil(image_width / patch_width)
+        nrows = math.ceil(image_height / patch_height)
        return ncols, nrows

    def get_image_size_with_most_features(self) -> ImageSize: