[CI/Build] Refactor processing tests (#27470)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-10-26 00:14:30 +08:00
parent a99564ac5b
commit 66a168a197
4 changed files with 174 additions and 230 deletions
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+from collections.abc import Set as AbstractSet
 from functools import partial

 import numpy as np
@@ -22,14 +23,17 @@ from vllm.multimodal.cache import MultiModalProcessorOnlyCache
 from vllm.multimodal.inputs import MultiModalInputs
 from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
 from vllm.transformers_utils.tokenizer import (
-    AnyTokenizer,
    MistralTokenizer,
    cached_tokenizer_from_config,
    encode_tokens,
 )

 from ....multimodal.utils import random_audio, random_image, random_video
-from ...registry import HF_EXAMPLE_MODELS
+from ...registry import (
+    _MULTIMODAL_EXAMPLE_MODELS,
+    _TRANSFORMERS_BACKEND_MODELS,
+    HF_EXAMPLE_MODELS,
+)


 def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
@@ -83,6 +87,119 @@ def qwen3_vl_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
    return mm_data


+# For some multimodal models, tokenizer will always add bos_token
+# at the beginning of prompt by default, causing hf_processor outputs
+# incorrect token ids. So we need use `add_special_tokens=False` here
+# to leave bos_token to be added by the processor.
+_ADD_SPECIAL_TOKENS_OVERRIDES = {
+    "ovis": False,
+    "ovis2_5": False,
+    "paligemma": False,
+    "ultravox": False,
+    "whisper": False,
+}
+
+_IGNORE_MM_KEYS = {
+    # In Ultravox, the audio_features can be different depending on padding
+    # The slight difference should not be a problem though, since
+    # attention_mask lets us ignore the difference.
+    "ultravox": {"audio_features"},
+}
+
+MM_DATA_PATCHES = {
+    # GLM4.1V and Qwen3-VL requires video metadata to be included in the input
+    "glm4v": glm4_1v_patch_mm_data,
+    "glm4v_moe": glm4_1v_patch_mm_data,
+    "qwen3_vl": qwen3_vl_patch_mm_data,
+    "qwen3_vl_moe": qwen3_vl_patch_mm_data,
+}
+
+
+def _iter_model_ids_to_test(model_arch_list: AbstractSet[str]):
+    for model_arch in model_arch_list:
+        model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
+        yield model_info.default
+
+        for extra_type, extra_model_id in model_info.extras.items():
+            if "fp" in extra_type:
+                continue  # Redundant to test quantized models
+
+            yield extra_model_id
+
+
+def _get_model_ids_to_test(model_arch_list: AbstractSet[str]):
+    return list(_iter_model_ids_to_test(model_arch_list))
+
+
+def get_model_ids_to_test():
+    transformers_arch_ids = {
+        model_id
+        for info in _TRANSFORMERS_BACKEND_MODELS.values()
+        for model_id in (info.default, *info.extras.values())
+    }
+    vllm_only_archs = {
+        arch
+        for arch, info in _MULTIMODAL_EXAMPLE_MODELS.items()
+        if not any(
+            model_id in transformers_arch_ids
+            for model_id in (info.default, *info.extras.values())
+        )
+    }
+
+    return _get_model_ids_to_test(vllm_only_archs)
+
+
+def get_text_token_prompts(
+    processor: BaseMultiModalProcessor,
+    mm_data: MultiModalDataDict,
+):
+    dummy_inputs = processor.dummy_inputs
+    tokenizer = processor.info.get_tokenizer()
+    model_config = processor.info.ctx.model_config
+
+    model_type = model_config.hf_config.model_type
+    if model_type in MM_DATA_PATCHES:
+        mm_data = MM_DATA_PATCHES[model_type](mm_data)
+
+    parsed_data = processor.data_parser.parse_mm_data(mm_data)
+    mm_counts = {k: len(vs) for k, vs in parsed_data.items()}
+
+    text_prompt: str | None
+    token_prompt: list[int]
+    if isinstance(tokenizer, MistralTokenizer):
+        images = parsed_data.get("image", [])
+        request = ChatCompletionRequest(
+            messages=[
+                UserMessage(
+                    content=[
+                        TextChunk(text=""),
+                        *(ImageChunk(image=image) for image in images),
+                    ]
+                ),
+            ]
+        )
+        res = tokenizer.mistral.encode_chat_completion(request)
+
+        # Mistral does not support decode_tokens with skip_special_tokens=False
+        text_prompt = None
+        token_prompt = res.tokens
+    else:
+        inputs = dummy_inputs.get_dummy_processor_inputs(
+            model_config.max_model_len,
+            mm_counts,
+        )
+        assert isinstance(inputs.prompt, str)
+
+        text_prompt = inputs.prompt
+        token_prompt = encode_tokens(
+            tokenizer,
+            text_prompt,
+            add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type),
+        )
+
+    return text_prompt, token_prompt
+
+
 def _test_processing_correctness(
    model_id_or_arch: str,
    hit_rate: float,
@@ -148,8 +265,6 @@ def _test_processing_correctness(

    baseline_processor = factories.build_processor(ctx, cache=None)
    cached_processor = factories.build_processor(ctx, cache=cache)
-    dummy_inputs = baseline_processor.dummy_inputs
-    tokenizer = baseline_processor.info.get_tokenizer()

    rng = np.random.RandomState(0)

@@ -175,29 +290,6 @@ def _test_processing_correctness(
            for k, limit in limit_mm_per_prompt_ints.items()
        }

-        mm_counts = {k: len(vs) for k, vs in mm_data.items()}
-
-        # Mistral chat outputs tokens directly, rather than text prompts
-        if isinstance(tokenizer, MistralTokenizer):
-            images = mm_data.get("image", [])
-            request = ChatCompletionRequest(
-                messages=[
-                    UserMessage(
-                        content=[
-                            TextChunk(text=""),
-                            *(ImageChunk(image=image) for image in images),
-                        ]
-                    ),
-                ]
-            )
-            res = tokenizer.mistral.encode_chat_completion(request)
-            prompt = res.tokens
-        else:
-            prompt = dummy_inputs.get_dummy_processor_inputs(
-                model_config.max_model_len,
-                mm_counts,
-            ).prompt
-
        # Drop unnecessary keys and test single -> multi conversion
        if rng.rand() < simplify_rate:
            for k in list(mm_data.keys()):
@@ -208,8 +300,6 @@ def _test_processing_correctness(

        _test_processing_correctness_one(
            model_config,
-            tokenizer,
-            prompt,
            mm_data,
            baseline_processor,
            cached_processor,
@@ -217,59 +307,17 @@ def _test_processing_correctness(
        )


-# For some multimodal models, tokenizer will always add bos_token
-# at the beginning of prompt by default, causing hf_processor outputs
-# incorrect token ids. So we need use `add_special_tokens=False` here
-# to leave bos_token to be added by the processor.
-_ADD_SPECIAL_TOKENS_OVERRIDES = {
-    "ovis": False,
-    "ovis2_5": False,
-    "paligemma": False,
-    "ultravox": False,
-    "whisper": False,
-}
-
-_IGNORE_MM_KEYS = {
-    # In Ultravox, the audio_features can be different depending on padding
-    # The slight difference should not be a problem though, since
-    # attention_mask lets us ignore the difference.
-    "ultravox": {"audio_features"},
-}
-
-MM_DATA_PATCHES = {
-    # GLM4.1V and Qwen3-VL requires video metadata to be included in the input
-    "glm4v": glm4_1v_patch_mm_data,
-    "glm4v_moe": glm4_1v_patch_mm_data,
-    "qwen3_vl": qwen3_vl_patch_mm_data,
-    "qwen3_vl_moe": qwen3_vl_patch_mm_data,
-}
-
-
 def _test_processing_correctness_one(
    model_config: ModelConfig,
-    tokenizer: AnyTokenizer,
-    prompt: str | list[int],
    mm_data: MultiModalDataDict,
    baseline_processor: BaseMultiModalProcessor,
    cached_processor: BaseMultiModalProcessor,
    batch_idx: int,
 ):
    model_type = model_config.hf_config.model_type
-    ignore_mm_keys = _IGNORE_MM_KEYS.get(model_type, set[str]())
-    if model_type in MM_DATA_PATCHES:
-        mm_data = MM_DATA_PATCHES[model_type](mm_data)

-    if isinstance(prompt, str):
-        text_prompt = prompt
-        token_prompt = encode_tokens(
-            tokenizer,
-            prompt,
-            add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type),
-        )
-    else:
-        # Mistral does not support decode_tokens with skip_special_tokens=False
-        text_prompt = None
-        token_prompt = prompt
+    text_prompt, token_prompt = get_text_token_prompts(baseline_processor, mm_data)
+    ignore_mm_keys = _IGNORE_MM_KEYS.get(model_type, set[str]())

    baseline_tokenized_result = baseline_processor.apply(
        token_prompt,
@@ -324,81 +372,7 @@ def _test_processing_correctness_one(
        )


-@pytest.mark.parametrize(
-    "model_id",
-    [
-        "rhymes-ai/Aria",
-        "CohereForAI/aya-vision-8b",
-        "Open-Bee/Bee-8B-RL",
-        "Salesforce/blip2-opt-2.7b",
-        "facebook/chameleon-7b",
-        "CohereLabs/command-a-vision-07-2025",
-        "deepseek-ai/deepseek-vl2-tiny",
-        "deepseek-ai/DeepSeek-OCR",
-        "baidu/ERNIE-4.5-VL-28B-A3B-PT",
-        "adept/fuyu-8b",
-        "google/gemma-3-4b-it",
-        "google/gemma-3n-E2B-it",
-        "zai-org/glm-4v-9b",
-        "zai-org/GLM-4.1V-9B-Thinking",
-        "zai-org/GLM-4.5V",
-        "ibm-granite/granite-speech-3.3-2b",
-        "h2oai/h2ovl-mississippi-800m",
-        "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
-        "HuggingFaceM4/Idefics3-8B-Llama3",
-        "internlm/Intern-S1",
-        "OpenGVLab/InternVL2-1B",
-        "OpenGVLab/InternVL3-1B",
-        "OpenGVLab/InternVL3_5-1B",
-        "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview",
-        "OpenGVLab/InternVL3_5-30B-A3B",
-        "Kwai-Keye/Keye-VL-8B-Preview",
-        "Kwai-Keye/Keye-VL-1_5-8B",
-        "moonshotai/Kimi-VL-A3B-Instruct",
-        "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-        "llava-hf/llava-1.5-7b-hf",
-        "llava-hf/llava-v1.6-mistral-7b-hf",
-        "llava-hf/LLaVA-NeXT-Video-7B-hf",
-        "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
-        "TIGER-Lab/Mantis-8B-siglip-llama3",
-        "mispeech/midashenglm-7b",
-        "openbmb/MiniCPM-Llama3-V-2_5",
-        "openbmb/MiniCPM-o-2_6",
-        "openbmb/MiniCPM-V-2_6",
-        "MiniMaxAI/MiniMax-VL-01",
-        "allenai/Molmo-7B-D-0924",
-        "allenai/Molmo-7B-O-0924",
-        "nvidia/NVLM-D-72B",
-        "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1",
-        "AIDC-AI/Ovis1.6-Gemma2-9B",
-        "AIDC-AI/Ovis1.6-Llama3.2-3B",
-        "AIDC-AI/Ovis2-1B",
-        "AIDC-AI/Ovis2.5-2B",
-        "google/paligemma-3b-mix-224",
-        "google/paligemma2-3b-ft-docci-448",
-        "microsoft/Phi-3.5-vision-instruct",
-        "microsoft/Phi-4-multimodal-instruct",
-        "mistralai/Pixtral-12B-2409",
-        "mistral-community/pixtral-12b",
-        "Qwen/Qwen-VL-Chat",
-        "Qwen/Qwen2-VL-2B-Instruct",
-        "Qwen/Qwen2.5-VL-3B-Instruct",
-        "Qwen/Qwen2-Audio-7B-Instruct",
-        "Qwen/Qwen2.5-Omni-3B",
-        "Qwen/Qwen3-VL-4B-Instruct",
-        "Qwen/Qwen3-VL-30B-A3B-Instruct",
-        "Qwen/Qwen3-Omni-30B-A3B-Instruct",
-        "YannQi/R-4B",
-        "Skywork/Skywork-R1V-38B",
-        "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
-        "stepfun-ai/step3",
-        "fixie-ai/ultravox-v0_5-llama-3_2-1b",
-        "openai/whisper-large-v3",
-        "omni-research/Tarsier-7b",
-        "omni-research/Tarsier2-Recap-7b",
-        "mistralai/Voxtral-Mini-3B-2507",
-    ],
-)
+@pytest.mark.parametrize("model_id", get_model_ids_to_test())
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
@pytest.mark.parametrize("num_batches", [32])
@pytest.mark.parametrize("simplify_rate", [1.0])
@@ -409,7 +383,12 @@ def test_processing_correctness(
    simplify_rate: float,
 ):
    if model_id == "google/gemma-3n-E2B-it":
-        pytest.skip("Skipping gemma-3n-E2B-it due to transformers #39911 bug.")
+        pytest.skip("Fix later")
+    if model_id == "OpenGVLab/InternVL2-2B":
+        pytest.skip("Fix later")
+    if model_id == "jinaai/jina-reranker-m0":
+        pytest.skip("Fix later")
+
    _test_processing_correctness(
        model_id,
        hit_rate=hit_rate,