[Model] Always use Transformers backend for PaliGemma and Gemma3-MM (#26715)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -113,25 +113,6 @@ VLM_TEST_SETTINGS = {
|
||||
dtype="bfloat16" if current_platform.is_cpu() else "auto",
|
||||
marks=[pytest.mark.core_model, pytest.mark.cpu_model],
|
||||
),
|
||||
"paligemma": VLMTestInfo(
|
||||
models=["google/paligemma-3b-mix-224"],
|
||||
test_type=VLMTestType.IMAGE,
|
||||
prompt_formatter=identity,
|
||||
img_idx_to_prompt=lambda idx: "",
|
||||
# Paligemma uses its own sample prompts because the default one fails
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "caption es",
|
||||
"cherry_blossom": "What is in the picture?",
|
||||
}
|
||||
),
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
|
||||
dtype="bfloat16",
|
||||
marks=[
|
||||
pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")
|
||||
],
|
||||
),
|
||||
"qwen2_5_vl": VLMTestInfo(
|
||||
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
|
||||
@@ -196,14 +177,24 @@ VLM_TEST_SETTINGS = {
|
||||
# Gemma3 has bidirectional mask on images
|
||||
"gemma3-transformers": VLMTestInfo(
|
||||
models=["google/gemma-3-4b-it"],
|
||||
test_type=VLMTestType.IMAGE,
|
||||
prompt_formatter=lambda vid_prompt: f"<'<bos><start_of_turn>user\n{vid_prompt}<start_of_image><end_of_turn>\n<start_of_turn>model\n", # noqa: E501
|
||||
max_model_len=4096,
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<start_of_image>What is the season?",
|
||||
}
|
||||
),
|
||||
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
|
||||
max_model_len=8192,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
# TODO: Support `do_pan_and_scan` in transformers backend
|
||||
# patch_hf_runner=model_utils.gemma3_patch_hf_runner,
|
||||
vllm_output_post_proc=model_utils.gemma3_vllm_to_hf_output,
|
||||
image_size_factors=[(0.25, 0.5, 1.0)],
|
||||
vllm_runner_kwargs={
|
||||
"model_impl": "transformers",
|
||||
# "mm_processor_kwargs": {"do_pan_and_scan": True},
|
||||
},
|
||||
marks=[pytest.mark.core_model],
|
||||
),
|
||||
@@ -222,6 +213,27 @@ VLM_TEST_SETTINGS = {
|
||||
},
|
||||
marks=[pytest.mark.core_model],
|
||||
),
|
||||
# PaliGemma has PrefixLM attention
|
||||
"paligemma-transformers": VLMTestInfo(
|
||||
models=["google/paligemma-3b-mix-224"],
|
||||
test_type=VLMTestType.IMAGE,
|
||||
prompt_formatter=identity,
|
||||
img_idx_to_prompt=lambda idx: "",
|
||||
# PaliGemma uses its own sample prompts because the default one fails
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "caption es",
|
||||
"cherry_blossom": "What is in the picture?",
|
||||
}
|
||||
),
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
|
||||
image_size_factors=[(0.25, 0.5, 1.0)],
|
||||
vllm_runner_kwargs={
|
||||
"model_impl": "transformers",
|
||||
},
|
||||
marks=[pytest.mark.core_model],
|
||||
),
|
||||
# Pixel values from processor are not 4D or 5D arrays
|
||||
"qwen2_5_vl-transformers": VLMTestInfo(
|
||||
models=["Qwen/Qwen2.5-VL-3B-Instruct"],
|
||||
@@ -348,24 +360,6 @@ VLM_TEST_SETTINGS = {
|
||||
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
|
||||
marks=[large_gpu_mark(min_gb=32)],
|
||||
),
|
||||
"gemma3": VLMTestInfo(
|
||||
models=["google/gemma-3-4b-it"],
|
||||
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
|
||||
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
|
||||
single_image_prompts=IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501
|
||||
"cherry_blossom": "<start_of_image>What is the season?",
|
||||
}
|
||||
),
|
||||
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
|
||||
max_model_len=4096,
|
||||
max_num_seqs=2,
|
||||
auto_cls=AutoModelForImageTextToText,
|
||||
vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
|
||||
patch_hf_runner=model_utils.gemma3_patch_hf_runner,
|
||||
num_logprobs=10,
|
||||
),
|
||||
"glm4v": VLMTestInfo(
|
||||
models=["zai-org/glm-4v-9b"],
|
||||
test_type=VLMTestType.IMAGE,
|
||||
|
||||
Reference in New Issue
Block a user