Remove V0 Encoder-Decoder Support (#24907)

Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
2025-09-15 21:17:14 -07:00
parent 5206ab20ba
commit 759ef49b15
47 changed files with 13 additions and 9661 deletions
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -167,8 +167,6 @@ def _test_processing_correctness(
 # incorrect token ids. So we need to use `add_special_tokens=False` here
 # to leave bos_token to be added by the processor.
 _ADD_SPECIAL_TOKENS_OVERRIDES = {
-    "donut": False,
-    "mllama": False,
    "ovis": False,
    "ovis2_5": False,
    "paligemma": False,
@@ -278,9 +276,7 @@ def _test_processing_correctness_one(
    "facebook/chameleon-7b",
    "CohereLabs/command-a-vision-07-2025",
    "deepseek-ai/deepseek-vl2-tiny",
-    "naver-clova-ix/donut-base-finetuned-docvqa",
    "baidu/ERNIE-4.5-VL-28B-A3B-PT",
-    "microsoft/Florence-2-base",
    "adept/fuyu-8b",
    "google/gemma-3-4b-it",
    "google/gemma-3n-E2B-it",
@@ -305,7 +301,6 @@ def _test_processing_correctness_one(
    "llava-hf/llava-v1.6-mistral-7b-hf",
    "llava-hf/LLaVA-NeXT-Video-7B-hf",
    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
-    "meta-llama/Llama-3.2-11B-Vision-Instruct",
    "TIGER-Lab/Mantis-8B-siglip-llama3",
    "mispeech/midashenglm-7b",
    "openbmb/MiniCPM-Llama3-V-2_5",
--- a/tests/models/multimodal/processing/test_mllama.py
+++ b/tests/models/multimodal/processing/test_mllama.py
@@ -1,72 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Tests for mllama's multimodal preprocessing and profiling."""
-import pytest
-from transformers import MllamaConfig
-
-from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.profiling import MultiModalProfiler
-
-from ...utils import build_model_context
-
-
-@pytest.mark.parametrize("model_id",
-                         ["meta-llama/Llama-3.2-11B-Vision-Instruct"])
-@pytest.mark.parametrize("max_model_len", [4096, 8192, 25600, 131072])
-@pytest.mark.parametrize("max_num_seqs", [1, 2, 8])
-def test_profiling(
-    model_id: str,
-    max_model_len: int,
-    max_num_seqs: int,
-):
-    # regression test for https://github.com/vllm-project/vllm/issues/13929
-    from vllm.model_executor.models.mllama import calc_token_per_chunk
-
-    model_config_kwargs = {
-        "max_model_len": max_model_len,
-    }
-    ctx = build_model_context(
-        model_id,
-        model_config_kwargs=model_config_kwargs,
-        limit_mm_per_prompt={"image": 1},
-    )
-
-    mm_config = ctx.get_mm_config()
-    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
-    profiler = MultiModalProfiler(processor)
-
-    dummy_encoder_data = profiler.get_encoder_dummy_data(
-        max_model_len,
-        mm_counts=mm_config.limit_per_prompt,
-    )
-    dummy_mm_data = processor.dummy_inputs.get_dummy_processor_inputs(
-        max_model_len,
-        mm_counts=mm_config.limit_per_prompt,
-    )
-
-    hf_config = ctx.get_hf_config(MllamaConfig)
-    image_size = hf_config.vision_config.image_size
-    encoder_seq_lens = [len(dummy_encoder_data.prompt_token_ids)
-                        ] * max_num_seqs
-
-    mm_data = processor.apply(
-        prompt=dummy_mm_data.prompt,
-        mm_data=dummy_mm_data.mm_data,
-        hf_processor_mm_kwargs=dict(),
-    )["mm_kwargs"].get_data()
-
-    # Get the actual number of encoder tokens for each sample.
-    # Because attn_metadata.encoder_seq_lens only counts the last
-    # group of images for each sample, which is used to cheat the
-    # block manager to allocate blocks for those images only.
-    # See MllamaMultiModalProcessor for more details.
-    num_tiles = [[t] for t in mm_data.pop("num_tiles")]
-    num_tokens_per_tile = calc_token_per_chunk(image_size)
-    actual_encoder_seq_lens = [
-        sum(num_tile) * num_tokens_per_tile for num_tile in num_tiles
-    ]
-
-    # simulate mllama image-present prefill.
-    for actual_len, last_group_len in zip(actual_encoder_seq_lens,
-                                          encoder_seq_lens):
-        assert actual_len >= last_group_len
--- a/tests/models/multimodal/processing/test_tensor_schema.py
+++ b/tests/models/multimodal/processing/test_tensor_schema.py
@@ -31,7 +31,6 @@ from ...utils import dummy_hf_overrides

 ARCH_TO_SKIP = {
    "MolmoForCausalLM": "incompatible requirements",
-    "Florence2ForConditionalGeneration": "not supported in V1",
 }
 ARCH_NEEDS_EXTRAS = [
    "InternVLChatModel",