[CI/Build] Add Model Tests for Qwen2-VL (#9846)

Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Alex Brooks
2024-10-31 10:10:52 -06:00
committed by GitHub
parent 5608e611c2
commit 16b8f7a86f
9 changed files with 106 additions and 52 deletions

View File

@@ -75,6 +75,63 @@ COMMON_BROADCAST_SETTINGS = {
# this is a good idea for checking your command first, since tests are slow.
VLM_TEST_SETTINGS = {
#### Core tests to always run in the CI
"llava": VLMTestInfo(
models=["llava-hf/llava-1.5-7b-hf"],
test_type=(
VLMTestType.EMBEDDING,
VLMTestType.IMAGE,
VLMTestType.CUSTOM_INPUTS
),
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
convert_assets_to_embeddings=model_utils.get_llava_embeddings,
max_model_len=4096,
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions(
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
),
limit_mm_per_prompt={"image": 4},
)],
marks=[pytest.mark.core_model],
),
"paligemma": VLMTestInfo(
models=["google/paligemma-3b-mix-224"],
test_type=VLMTestType.IMAGE,
prompt_formatter=identity,
img_idx_to_prompt = lambda idx: "",
# Paligemma uses its own sample prompts because the default one fails
single_image_prompts=IMAGE_ASSETS.prompts({
"stop_sign": "caption es",
"cherry_blossom": "What is in the picture?",
}),
auto_cls=AutoModelForVision2Seq,
postprocess_inputs=model_utils.get_key_type_post_processor(
"pixel_values"
),
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
dtype="half" if current_platform.is_rocm() else ("half", "float"),
marks=[pytest.mark.core_model],
),
"qwen2_vl": VLMTestInfo(
models=["Qwen/Qwen2-VL-2B-Instruct"],
test_type=(
VLMTestType.IMAGE,
VLMTestType.MULTI_IMAGE,
VLMTestType.VIDEO
),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
marks=[pytest.mark.core_model],
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
),
#### Extended model tests
"blip2": VLMTestInfo(
models=["Salesforce/blip2-opt-2.7b"],
test_type=VLMTestType.IMAGE,
@@ -151,25 +208,6 @@ VLM_TEST_SETTINGS = {
use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner,
),
"llava": VLMTestInfo(
models=["llava-hf/llava-1.5-7b-hf"],
test_type=(
VLMTestType.EMBEDDING,
VLMTestType.IMAGE,
VLMTestType.CUSTOM_INPUTS
),
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
convert_assets_to_embeddings=model_utils.get_llava_embeddings,
max_model_len=4096,
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
custom_test_opts=[CustomTestOptions(
inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
),
limit_mm_per_prompt={"image": 4},
)],
),
"llava_next": VLMTestInfo(
models=["llava-hf/llava-v1.6-mistral-7b-hf"],
test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
@@ -200,12 +238,12 @@ VLM_TEST_SETTINGS = {
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
# Llava-one-vision tests fixed sizes & the default size factors
image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
runner_mm_key="videos",
custom_test_opts=[CustomTestOptions(
inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs(
formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
),
limit_mm_per_prompt={"video": 4},
runner_mm_key="videos",
)],
),
# FIXME
@@ -218,9 +256,11 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForVision2Seq,
vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output,
image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))],
runner_mm_key="videos",
marks=[
pytest.mark.skip(reason="LLava next video tests currently fail.")
pytest.mark.skipif(
transformers.__version__.startswith("4.46"),
reason="Model broken with changes in transformers 4.46"
)
],
),
"minicpmv": VLMTestInfo(
@@ -234,23 +274,6 @@ VLM_TEST_SETTINGS = {
postprocess_inputs=model_utils.wrap_inputs_post_processor,
hf_output_post_proc=model_utils.minicmpv_trunc_hf_output,
),
"paligemma": VLMTestInfo(
models=["google/paligemma-3b-mix-224"],
test_type=VLMTestType.IMAGE,
prompt_formatter=identity,
img_idx_to_prompt = lambda idx: "",
# Paligemma uses its own sample prompts because the default one fails
single_image_prompts=IMAGE_ASSETS.prompts({
"stop_sign": "caption es",
"cherry_blossom": "What is in the picture?",
}),
auto_cls=AutoModelForVision2Seq,
postprocess_inputs=model_utils.get_key_type_post_processor(
"pixel_values"
),
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
dtype="half" if current_platform.is_rocm() else ("half", "float"),
),
# Tests for phi3v currently live in another file because of a bug in
# transformers. Once this issue is fixed, we can enable them here instead.
# https://github.com/huggingface/transformers/issues/34307