Signed-off-by: Luciano Martins <lucianommartins@users.noreply.github.com>
Co-authored-by: Luciano Martins <lucianommartins@users.noreply.github.com>
@@ -112,6 +112,25 @@ VLM_TEST_SETTINGS = {
        vllm_runner_kwargs={"enable_mm_embeds": True},
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
    "paligemma": VLMTestInfo(
        models=["google/paligemma-3b-mix-224"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=identity,
        img_idx_to_prompt=lambda idx: "",
        # PaliGemma uses its own sample prompts because the default one fails
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
                "stop_sign": "caption es",
                "cherry_blossom": "What is in the picture?",
            }
        ),
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
        dtype="bfloat16",
        marks=[
            pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")
        ],
    ),
    "qwen2_5_vl": VLMTestInfo(
        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
@@ -176,24 +195,14 @@ VLM_TEST_SETTINGS = {
    # Gemma3 has bidirectional mask on images
    "gemma3-transformers": VLMTestInfo(
        models=["google/gemma-3-4b-it"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
                "stop_sign": "<start_of_image>What's the content in the center of the image?",  # noqa: E501
                "cherry_blossom": "<start_of_image>What is the season?",
            }
        ),
        multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.",  # noqa: E501
        max_model_len=8192,
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda vid_prompt: f"<bos><start_of_turn>user\n{vid_prompt}<start_of_image><end_of_turn>\n<start_of_turn>model\n",  # noqa: E501
        max_model_len=4096,
        auto_cls=AutoModelForImageTextToText,
        # TODO: Support `do_pan_and_scan` in transformers backend
        # patch_hf_runner=model_utils.gemma3_patch_hf_runner,
        vllm_output_post_proc=model_utils.gemma3_vllm_to_hf_output,
        image_size_factors=[(0.25, 0.5, 1.0)],
        vllm_runner_kwargs={
            "model_impl": "transformers",
            # "mm_processor_kwargs": {"do_pan_and_scan": True},
        },
        marks=[pytest.mark.core_model],
    ),
@@ -212,27 +221,6 @@ VLM_TEST_SETTINGS = {
        },
        marks=[pytest.mark.core_model],
    ),
    # PaliGemma has PrefixLM attention
    "paligemma-transformers": VLMTestInfo(
        models=["google/paligemma-3b-mix-224"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=identity,
        img_idx_to_prompt=lambda idx: "",
        # PaliGemma uses its own sample prompts because the default one fails
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
                "stop_sign": "caption es",
                "cherry_blossom": "What is in the picture?",
            }
        ),
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
        image_size_factors=[(0.25, 0.5, 1.0)],
        vllm_runner_kwargs={
            "model_impl": "transformers",
        },
        marks=[pytest.mark.core_model],
    ),
    # Pixel values from processor are not 4D or 5D arrays
    "qwen2_5_vl-transformers": VLMTestInfo(
        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
@@ -359,6 +347,24 @@ VLM_TEST_SETTINGS = {
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        marks=[large_gpu_mark(min_gb=32)],
    ),
    "gemma3": VLMTestInfo(
        models=["google/gemma-3-4b-it"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
                "stop_sign": "<start_of_image>What's the content in the center of the image?",  # noqa: E501
                "cherry_blossom": "<start_of_image>What is the season?",
            }
        ),
        multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.",  # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
        patch_hf_runner=model_utils.gemma3_patch_hf_runner,
        num_logprobs=10,
    ),
    "glm4v": VLMTestInfo(
        models=["zai-org/glm-4v-9b"],
        test_type=VLMTestType.IMAGE,
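
For reference, a minimal sketch of how the prompt fields on the new "paligemma" entry plausibly fit together, assuming the harness prepends the per-image token and then applies the formatter. PromptSpec and build_single_image_prompt are illustrative stand-ins, not the actual harness code.

from dataclasses import dataclass
from typing import Callable


def identity(prompt: str) -> str:
    return prompt


@dataclass
class PromptSpec:
    # Hypothetical stand-in for the prompt-related VLMTestInfo fields.
    prompt_formatter: Callable[[str], str]
    img_idx_to_prompt: Callable[[int], str]


paligemma_like = PromptSpec(
    prompt_formatter=identity,  # PaliGemma prompts pass through unchanged
    img_idx_to_prompt=lambda idx: "",  # no image token in the text prompt
)


def build_single_image_prompt(spec: PromptSpec, question: str) -> str:
    # Prepend the per-image token (empty here), then apply the formatter.
    return spec.prompt_formatter(spec.img_idx_to_prompt(0) + question)


assert build_single_image_prompt(paligemma_like, "caption es") == "caption es"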
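
The "-transformers" entries presumably forward vllm_runner_kwargs to the engine constructor, so a PaliGemma run through the Transformers backend would look roughly like the sketch below. The exact plumbing inside the test runner is an assumption here, and the image path is a placeholder.

from PIL import Image

from vllm import LLM

llm = LLM(
    model="google/paligemma-3b-mix-224",
    model_impl="transformers",  # use the HF Transformers modeling code
)
image = Image.open("stop_sign.jpg")  # any local image for illustration
outputs = llm.generate(
    {"prompt": "caption es", "multi_modal_data": {"image": image}}
)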
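
Finally, large_gpu_mark(min_gb=32) gates a test on available GPU memory. A plausible reimplementation with standard pytest and torch calls follows; the real helper lives in vLLM's test utilities and may differ in detail.

import pytest
import torch


def large_gpu_mark(min_gb: int) -> pytest.MarkDecorator:
    # Skip unless a GPU with at least `min_gb` GiB of total memory is present.
    total_gb = (
        torch.cuda.get_device_properties(0).total_memory / 2**30
        if torch.cuda.is_available()
        else 0.0
    )
    return pytest.mark.skipif(
        total_gb < min_gb,
        reason=f"Need a GPU with at least {min_gb} GiB of memory",
    )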