[Model] Support VLMs with transformers backend (#20543)
Signed-off-by: raushan <raushan@huggingface.co>
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
commit 9499e26e2a
parent 51ba839555
@@ -35,6 +35,8 @@ if current_platform.is_rocm():
REQUIRES_V0_MODELS = [
    # V1 Test: not enough KV cache space in CI.
    "fuyu",
    # V1 Test: Deadlock issue when processing mm_inputs
    "llava-onevision-transformers",
]

# yapf: disable
@@ -170,6 +172,79 @@ VLM_TEST_SETTINGS = {
        hf_output_post_proc=model_utils.ultravox_trunc_hf_output,
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
    #### Transformers fallback to test
    ## To reduce test burden, we only test batching arbitrary image size
    # Dynamic image length and number of patches
    "llava-onevision-transformers": VLMTestInfo(
        models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
        max_model_len=16384,
        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
        image_size_factors=[(0.25, 0.5, 1.0)],
        vllm_runner_kwargs={
            "model_impl": "transformers",
            "disable_mm_preprocessor_cache": True,
            "enable_prefix_caching": False,
        },
        marks=[pytest.mark.core_model],
    ),
    # FIXME(Isotr0py): Enable this test after
    # https://github.com/huggingface/transformers/pull/39470 released
    # "idefics3-transformers": VLMTestInfo(
    #     models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
    #     test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
    #     prompt_formatter=lambda img_prompt: f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
    #     img_idx_to_prompt=lambda idx: "<image>",
    #     max_model_len=8192,
    #     max_num_seqs=2,
    #     auto_cls=AutoModelForImageTextToText,
    #     hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
    #     image_size_factors=[(0.25, 0.5, 1.0)],
    #     vllm_runner_kwargs={
    #         "model_impl": "transformers",
    #         "disable_mm_preprocessor_cache": True,
    #         "enable_prefix_caching": False,
    #     },
    #     marks=[pytest.mark.core_model],
    # ),
    # Pixel values from processor are not 4D or 5D arrays
    "qwen2_5_vl-transformers": VLMTestInfo(
        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        image_size_factors=[(0.25, 0.2, 0.15)],
        vllm_runner_kwargs={
            "model_impl": "transformers",
            "disable_mm_preprocessor_cache": True,
            "enable_prefix_caching": False,
        },
        marks=[large_gpu_mark(min_gb=32)],
    ),
    # Check "auto" with fallback to transformers
    "internvl-transformers": VLMTestInfo(
        models=["OpenGVLab/InternVL3-1B-hf"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
        img_idx_to_prompt=lambda idx: "<IMG_CONTEXT>",
        max_model_len=4096,
        use_tokenizer_eos=True,
        image_size_factors=[(0.25, 0.5, 1.0)],
        vllm_runner_kwargs={
            "model_impl": "auto",
            "disable_mm_preprocessor_cache": True,
            "enable_prefix_caching": False,
        },
        auto_cls=AutoModelForImageTextToText,
        marks=[pytest.mark.core_model],
    ),
    #### Extended model tests
    "aria": VLMTestInfo(
        models=["rhymes-ai/Aria"],
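
The internvl-transformers entry exercises the "auto" path instead: vLLM resolves a native implementation when one is registered and otherwise falls back to the Transformers backend, which is what the test verifies. A minimal sketch of the same configuration, reusing the checkpoint from the test entry:

from vllm import LLM

# "auto" (the default) tries vLLM's native model first and falls back
# to the Transformers modeling code when the architecture has no
# native implementation.
llm = LLM(
    model="OpenGVLab/InternVL3-1B-hf",
    model_impl="auto",
    disable_mm_preprocessor_cache=True,
    enable_prefix_caching=False,
    max_model_len=4096,
)

Disabling the multimodal preprocessor cache and prefix caching in both sketches mirrors the test's runner kwargs, which keep each run independent of cached state.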