[Model] Support VLMs with transformers backend (#20543)

Signed-off-by: raushan <raushan@huggingface.co>
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
This commit is contained in:
Raushan Turganbay
2025-07-20 15:25:50 +02:00
committed by GitHub
parent 51ba839555
commit 9499e26e2a
7 changed files with 625 additions and 87 deletions

View File

@@ -35,6 +35,8 @@ if current_platform.is_rocm():
REQUIRES_V0_MODELS = [
# V1 Test: not enough KV cache space in C1.
"fuyu",
# V1 Test: Deadlock issue when processing mm_inputs
"llava-onevision-transformers",
]
# yapf: disable
@@ -170,6 +172,79 @@ VLM_TEST_SETTINGS = {
hf_output_post_proc=model_utils.ultravox_trunc_hf_output,
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
),
#### Transformers fallback to test
## To reduce test burden, we only test batching arbitrary image size
# Dynamic image length and number of patches
"llava-onevision-transformers": VLMTestInfo(
models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
test_type=VLMTestType.IMAGE,
prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
max_model_len=16384,
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
image_size_factors=[(0.25, 0.5, 1.0)],
vllm_runner_kwargs={
"model_impl": "transformers",
"disable_mm_preprocessor_cache": True,
"enable_prefix_caching": False,
},
marks=[pytest.mark.core_model],
),
# FIXME(Isotr0py): Enable this test after
# https://github.com/huggingface/transformers/pull/39470 released
# "idefics3-transformers": VLMTestInfo(
# models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
# test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
# prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
# img_idx_to_prompt=lambda idx: "<image>",
# max_model_len=8192,
# max_num_seqs=2,
# auto_cls=AutoModelForImageTextToText,
# hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
# image_size_factors=[(0.25, 0.5, 1.0)],
# vllm_runner_kwargs={
# "model_impl": "transformers",
# "disable_mm_preprocessor_cache": True,
# "enable_prefix_caching": False,
# },
# marks=[pytest.mark.core_model],
# ),
# Pixel values from processor are not 4D or 5D arrays
"qwen2_5_vl-transformers": VLMTestInfo(
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
image_size_factors=[(0.25, 0.2, 0.15)],
vllm_runner_kwargs={
"model_impl": "transformers",
"disable_mm_preprocessor_cache": True,
"enable_prefix_caching": False,
},
marks=[large_gpu_mark(min_gb=32)],
),
# Check "auto" with fallback to transformers
"internvl-transformers": VLMTestInfo(
models=["OpenGVLab/InternVL3-1B-hf"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<IMG_CONTEXT>",
max_model_len=4096,
use_tokenizer_eos=True,
image_size_factors=[(0.25, 0.5, 1.0)],
vllm_runner_kwargs={
"model_impl": "auto",
"disable_mm_preprocessor_cache": True,
"enable_prefix_caching": False,
},
auto_cls=AutoModelForImageTextToText,
marks=[pytest.mark.core_model],
),
#### Extended model tests
"aria": VLMTestInfo(
models=["rhymes-ai/Aria"],