[Bugfix] Standardize getting number of image patches/tokens (#34358)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2026-02-13 12:47:01 +08:00
committed by GitHub
parent 6afa587d31
commit 372b2e762a
29 changed files with 319 additions and 331 deletions

View File

@@ -3,7 +3,9 @@
"""Tests for smolvlm's multimodal preprocessing kwargs."""
import pytest
from packaging.version import Version
from transformers import SmolVLMConfig
from transformers import __version__ as TRANSFORMERS_VERSION
from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -11,6 +13,10 @@ from ....conftest import ImageTestAssets
from ...utils import build_model_context
@pytest.mark.skipif(
Version(TRANSFORMERS_VERSION) < Version("5.2.0"),
reason="See https://github.com/huggingface/transformers/pull/43948",
)
@pytest.mark.parametrize("model_id", ["HuggingFaceTB/SmolVLM2-2.2B-Instruct"])
@pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img"),
@@ -63,7 +69,11 @@ def test_processor_override(
# Ensure the placeholder format is correct
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
hf_processed_inputs = hf_processor(
text=prompt,
images=mm_data["image"],
**processor.info.ctx.get_merged_mm_kwargs(hf_processor_mm_kwargs),
)
assert processed_inputs["prompt_token_ids"] == hf_processed_inputs["input_ids"][0]
# Ensure we have the right number of placeholders per num_crops size