Remove V0 Encoder-Decoder Support (#24907)

Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
2025-09-15 21:17:14 -07:00
parent 5206ab20ba
commit 759ef49b15
47 changed files with 13 additions and 9661 deletions
--- a/examples/offline_inference/encoder_decoder_multimodal.py
+++ b/examples/offline_inference/encoder_decoder_multimodal.py
@@ -13,8 +13,6 @@ from typing import NamedTuple

 from vllm import LLM, EngineArgs, PromptType, SamplingParams
 from vllm.assets.audio import AudioAsset
-from vllm.assets.image import ImageAsset
-from vllm.multimodal.utils import fetch_image
 from vllm.utils import FlexibleArgumentParser


@@ -23,113 +21,6 @@ class ModelRequestData(NamedTuple):
    prompts: Sequence[PromptType]


-def run_donut():
-    engine_args = EngineArgs(
-        model="naver-clova-ix/donut-base-finetuned-docvqa",
-        max_num_seqs=2,
-        limit_mm_per_prompt={"image": 1},
-        dtype="float16",
-        hf_overrides={"architectures": ["DonutForConditionalGeneration"]},
-    )
-
-    # The input image size for donut-base-finetuned-docvqa is 2560 x 1920,
-    # and the patch_size is 4 x 4.
-    # Therefore, the initial number of patches is:
-    # Height: 1920 / 4 = 480 patches
-    # Width: 2560 / 4 = 640 patches
-    # The Swin model uses a staged downsampling approach,
-    # defined by the "depths": [2, 2, 14, 2] configuration.
-    # Before entering stages 2, 3, and 4, a "Patch Merging" operation is performed,
-    # which halves the feature map's dimensions (dividing both height and width by 2).
-    # Before Stage 2: The size changes from 480 x 640 to (480/2) x (640/2) = 240 x 320.
-    # Before Stage 3: The size changes from 240 x 320 to (240/2) x (320/2) = 120 x 160.
-    # Before Stage 4: The size changes from 120 x 160 to (120/2) x (160/2) = 60 x 80.
-    # Because vLLM needs to fill the image features with an encoder_prompt,
-    # and the encoder_prompt will have `<pad>` tokens added when tokenized,
-    # we need to construct an encoder_prompt with a length of 60 x 80 - 1 = 4799.
-    prompts = [
-        {
-            "encoder_prompt": {
-                "prompt": "".join(["$"] * 4799),
-                "multi_modal_data": {
-                    "image": fetch_image(
-                        "https://huggingface.co/datasets/hf-internal-testing/example-documents/resolve/main/jpeg_images/0.jpg"
-                    )  # noqa: E501
-                },
-            },
-            "decoder_prompt": "<s_docvqa><s_question>What time is the coffee break?</s_question><s_answer>",  # noqa: E501
-        },
-    ]
-
-    return ModelRequestData(
-        engine_args=engine_args,
-        prompts=prompts,
-    )
-
-
-def run_florence2():
-    engine_args = EngineArgs(
-        model="microsoft/Florence-2-large",
-        tokenizer="Isotr0py/Florence-2-tokenizer",
-        max_num_seqs=8,
-        trust_remote_code=True,
-        limit_mm_per_prompt={"image": 1},
-        dtype="half",
-    )
-
-    prompts = [
-        {  # implicit prompt with task token
-            "prompt": "<DETAILED_CAPTION>",
-            "multi_modal_data": {"image": ImageAsset("stop_sign").pil_image},
-        },
-        {  # explicit encoder/decoder prompt
-            "encoder_prompt": {
-                "prompt": "Describe in detail what is shown in the image.",
-                "multi_modal_data": {"image": ImageAsset("cherry_blossom").pil_image},
-            },
-            "decoder_prompt": "",
-        },
-    ]
-
-    return ModelRequestData(
-        engine_args=engine_args,
-        prompts=prompts,
-    )
-
-
-def run_mllama():
-    engine_args = EngineArgs(
-        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
-        max_model_len=8192,
-        max_num_seqs=2,
-        limit_mm_per_prompt={"image": 1},
-        dtype="half",
-    )
-
-    prompts = [
-        {  # Implicit prompt
-            "prompt": "<|image|><|begin_of_text|>What is the content of this image?",  # noqa: E501
-            "multi_modal_data": {
-                "image": ImageAsset("stop_sign").pil_image,
-            },
-        },
-        {  # Explicit prompt
-            "encoder_prompt": {
-                "prompt": "<|image|>",
-                "multi_modal_data": {
-                    "image": ImageAsset("stop_sign").pil_image,
-                },
-            },
-            "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.",  # noqa: E501
-        },
-    ]
-
-    return ModelRequestData(
-        engine_args=engine_args,
-        prompts=prompts,
-    )
-
-
 def run_whisper():
    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

@@ -166,9 +57,6 @@ def run_whisper():


 model_example_map = {
-    "donut": run_donut,
-    "florence2": run_florence2,
-    "mllama": run_mllama,
    "whisper": run_whisper,
 }

@@ -182,7 +70,7 @@ def parse_args():
        "--model-type",
        "-m",
        type=str,
-        default="mllama",
+        default="whisper",
        choices=model_example_map.keys(),
        help='Huggingface "model_type".',
    )