[Bugfix] Fix modality limits in vision language example (#17721)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Author: Cyrus Leung
Date:   2025-05-07 00:12:28 +08:00 (committed by GitHub)
parent 7525d5f3d5
commit 5b8c390747

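Every hunk in this diff applies the same one-line fix: each example runner hard-coded limit_mm_per_prompt={"image": 1}, so a runner invoked with modality="video" still declared an image limit instead of a video one. Keying the limit on the runner's modality argument fixes that. A minimal sketch of the corrected pattern (the helper name and model choice below are illustrative, not part of this commit):

    from vllm import EngineArgs

    def make_engine_args(modality: str) -> EngineArgs:
        # modality is "image" or "video"; keying the per-prompt
        # multimodal limit on it, rather than hard-coding "image",
        # is the entire fix.
        return EngineArgs(
            model="llava-hf/llava-1.5-7b-hf",  # stand-in for any runner's model
            max_model_len=4096,
            limit_mm_per_prompt={modality: 1},  # was: {"image": 1}
        )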

@@ -45,7 +45,7 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=4096,
         max_num_seqs=2,
         dtype="bfloat16",
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
@@ -71,7 +71,7 @@ def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=2048,
         max_num_seqs=2,
         mm_processor_kwargs={"crop_to_patches": True},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
     prompts = [
         f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
@@ -92,7 +92,7 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
     prompts = [f"Question: {question} Answer:" for question in questions]
     engine_args = EngineArgs(
         model="Salesforce/blip2-opt-6.7b",
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     return ModelRequestData(
@@ -110,7 +110,7 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
         model="facebook/chameleon-7b",
         max_model_len=4096,
         max_num_seqs=2,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     return ModelRequestData(
@@ -130,7 +130,7 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=4096,
         max_num_seqs=2,
         hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     prompts = [
@@ -155,7 +155,7 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
         max_num_seqs=2,
         trust_remote_code=True,
         dtype="bfloat16",
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions]
@@ -175,7 +175,7 @@ def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
         model="adept/fuyu-8b",
         max_model_len=2048,
         max_num_seqs=2,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     return ModelRequestData(
@@ -194,7 +194,7 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=2048,
         max_num_seqs=2,
         mm_processor_kwargs={"do_pan_and_scan": True},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     prompts = [("<bos><start_of_turn>user\n"
@@ -219,7 +219,7 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
         trust_remote_code=True,
         enforce_eager=True,
         hf_overrides={"architectures": ["GLM4VForCausalLM"]},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     prompts = [
@@ -246,7 +246,7 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         trust_remote_code=True,
         max_model_len=8192,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -287,7 +287,7 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
                 "longest_edge": 3 * 364
             },
         },
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
     prompts = [(
         f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
@@ -314,7 +314,7 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
                 "longest_edge": 384
             },
         },
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
     prompts = [
         (f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:")
@@ -337,7 +337,7 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         trust_remote_code=True,
         max_model_len=4096,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -378,7 +378,7 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
         model="moonshotai/Kimi-VL-A3B-Instruct",
         trust_remote_code=True,
         max_model_len=4096,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     return ModelRequestData(
@@ -398,7 +398,7 @@ def run_llava(questions: list[str], modality: str) -> ModelRequestData:
     engine_args = EngineArgs(
         model="llava-hf/llava-1.5-7b-hf",
         max_model_len=4096,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     return ModelRequestData(
@@ -415,7 +415,7 @@ def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
     engine_args = EngineArgs(
         model="llava-hf/llava-v1.6-mistral-7b-hf",
         max_model_len=8192,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     return ModelRequestData(
@@ -437,7 +437,7 @@ def run_llava_next_video(questions: list[str],
         model="llava-hf/LLaVA-NeXT-Video-7B-hf",
         max_model_len=8192,
         max_num_seqs=2,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     return ModelRequestData(
@@ -465,7 +465,7 @@ def run_llava_onevision(questions: list[str],
     engine_args = EngineArgs(
         model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
         max_model_len=16384,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     return ModelRequestData(
@@ -488,7 +488,7 @@ def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
         model="TIGER-Lab/Mantis-8B-siglip-llama3",
         max_model_len=4096,
         hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     stop_token_ids = [128009]
@@ -529,7 +529,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
         max_model_len=4096,
         max_num_seqs=2,
         trust_remote_code=True,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )
     # NOTE The stop_token_ids are different for various versions of MiniCPM-V
     # 2.0
@@ -584,7 +584,7 @@ def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=8192,
         max_num_seqs=2,
         tensor_parallel_size=2,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
@@ -610,7 +610,7 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         max_model_len=8192,
         max_num_seqs=2,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -645,7 +645,7 @@ def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
         max_num_seqs=4,
         tensor_parallel_size=8,
         gpu_memory_utilization=0.4,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -680,7 +680,7 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         trust_remote_code=True,
         dtype="bfloat16",
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     prompts = [
@@ -706,7 +706,7 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
         trust_remote_code=True,
         max_model_len=4096,
         tensor_parallel_size=4,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     tokenizer = AutoTokenizer.from_pretrained(model_name,
@@ -738,7 +738,7 @@ def run_ovis2(questions: list[str], modality: str) -> ModelRequestData:
         trust_remote_code=True,
         dtype="half",
         hf_overrides={"architectures": ["Ovis2ForConditionalGeneration"]},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     placeholder = "<image>\n"
@@ -761,7 +761,7 @@ def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
     prompts = ["caption en" for _ in questions]
     engine_args = EngineArgs(
         model="google/paligemma-3b-mix-224",
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     return ModelRequestData(
@@ -778,7 +778,7 @@ def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
     prompts = ["caption en" for _ in questions]
     engine_args = EngineArgs(
         model="google/paligemma2-3b-ft-docci-448",
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     return ModelRequestData(
@@ -815,7 +815,7 @@ def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
         max_num_seqs=2,
         # Note - mm_processor_kwargs can also be passed to generate/chat calls
         mm_processor_kwargs={"num_crops": 16},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     return ModelRequestData(
@@ -849,7 +849,7 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
         max_lora_rank=320,
         # Note - mm_processor_kwargs can also be passed to generate/chat calls
         mm_processor_kwargs={"dynamic_hd": 16},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     return ModelRequestData(
@@ -870,7 +870,7 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         max_model_len=6144,
         max_num_seqs=2,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
@@ -891,7 +891,7 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
         max_model_len=1024,
         max_num_seqs=2,
         hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     prompts = [f"{question}Picture 1: <img></img>\n" for question in questions]
@@ -916,7 +916,7 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
             "min_pixels": 28 * 28,
             "max_pixels": 1280 * 28 * 28,
         },
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     if modality == "image":
@@ -951,7 +951,7 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
             "max_pixels": 1280 * 28 * 28,
             "fps": 1,
         },
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     if modality == "image":
@@ -985,7 +985,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
             "max_pixels": 1280 * 28 * 28,
             "fps": [1],
         },
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     if modality == "image":
@@ -1018,7 +1018,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
         model=model_name,
         trust_remote_code=True,
         max_model_len=4096,
-        limit_mm_per_prompt={"image": 1},
+        limit_mm_per_prompt={modality: 1},
     )

     tokenizer = AutoTokenizer.from_pretrained(model_name,