[New Model] Donut model (#23229)

Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
汪志鹏
2025-08-24 20:52:24 +08:00
committed by GitHub
parent 5e021b4981
commit 416f05929a
11 changed files with 1240 additions and 3 deletions

@@ -13,6 +13,7 @@ from typing import NamedTuple
from vllm import LLM, EngineArgs, PromptType, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser
@@ -21,6 +22,50 @@ class ModelRequestData(NamedTuple):
    prompts: Sequence[PromptType]

def run_donut():
    engine_args = EngineArgs(
        model="naver-clova-ix/donut-base-finetuned-docvqa",
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
        dtype="float16",
        hf_overrides={"architectures": ["DonutForConditionalGeneration"]},
    )

    # The input image size for donut-base-finetuned-docvqa is 2560 x 1920,
    # and the patch_size is 4 x 4. The initial patch grid is therefore:
    # Height: 1920 / 4 = 480 patches
    # Width: 2560 / 4 = 640 patches
    # The Swin encoder downsamples in stages, as defined by its
    # "depths": [2, 2, 14, 2] configuration. Before entering stages 2, 3,
    # and 4, a patch-merging step halves both the height and the width of
    # the feature map:
    # Before stage 2: 480 x 640 -> (480/2) x (640/2) = 240 x 320
    # Before stage 3: 240 x 320 -> (240/2) x (320/2) = 120 x 160
    # Before stage 4: 120 x 160 -> (120/2) x (160/2) = 60 x 80
    # vLLM fills the image feature slots through the encoder_prompt, and
    # the tokenizer adds a <pad> token when the encoder_prompt is
    # tokenized, so the encoder_prompt must be 60 x 80 - 1 = 4799 long.
    prompts = [
        {
            "encoder_prompt": {
                "prompt": "".join(["$"] * 4799),
                "multi_modal_data": {
                    "image": fetch_image(
                        "https://huggingface.co/datasets/hf-internal-testing/example-documents/resolve/main/jpeg_images/0.jpg"  # noqa: E501
                    )
                },
            },
            "decoder_prompt": "<s_docvqa><s_question>What time is the coffee break?</s_question><s_answer>",  # noqa: E501
        },
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


def run_florence2():
    engine_args = EngineArgs(
        model="microsoft/Florence-2-large",
@@ -118,6 +163,7 @@ def run_whisper():
model_example_map = {
    "donut": run_donut,
    "florence2": run_florence2,
    "mllama": run_mllama,
    "whisper": run_whisper,