# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference with
multi-image input on vision language models for text generation,
using the chat template defined by the model.
"""
import os
from argparse import Namespace
from dataclasses import asdict
from typing import NamedTuple

from huggingface_hub import snapshot_download
from PIL.Image import Image
from transformers import AutoProcessor, AutoTokenizer

from vllm import LLM, EngineArgs, SamplingParams
from vllm.lora.request import LoRARequest
from vllm.multimodal.utils import fetch_image
from vllm.utils.argparse_utils import FlexibleArgumentParser
# Question asked about every image in the batch.
QUESTION = "What is the content of each image?"

# Publicly hosted sample images; every loader below fetches all of them.
IMAGE_URLS = [
    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg",
    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/lion.jpg",
    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/flycatcher.jpeg",
    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/somefish.jpg",
    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/starfish.jpg",
    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/snail.jpg",
    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/thistle.jpg",
    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/husky.jpg",
    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/orangetabbycat.jpg",
    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/guineapig.jpg",
    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/rabbit.jpg",
    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/horsepony.jpg",
]
class ModelRequestData(NamedTuple):
    """Everything needed to run one multi-image request against one model."""

    # Engine/launch configuration for the target model.
    engine_args: EngineArgs
    # Fully formatted prompt containing the image placeholders.
    prompt: str
    # Images fetched from IMAGE_URLS, in prompt order.
    image_data: list[Image]
    # Optional extra token IDs at which generation should stop.
    stop_token_ids: list[int] | None = None
    # Optional custom chat template overriding the model's default.
    chat_template: str | None = None
    # Optional LoRA adapters to apply (e.g. Phi-4-multimodal vision LoRA).
    lora_requests: list[LoRARequest] | None = None
    # Optional model-specific sampling parameters.
    sampling_params: SamplingParams | None = None
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.
def load_aria(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for rhymes-ai/Aria."""
    model_name = "rhymes-ai/Aria"
    engine_args = EngineArgs(
        model=model_name,
        tokenizer_mode="slow",
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={"image": len(image_urls)},
    )
    # One image placeholder per URL, in order.
    placeholders = "<fim_prefix><|img|><fim_suffix>\n" * len(image_urls)
    prompt = (
        f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n<|im_start|>assistant\n"
    )
    stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for CohereLabs/aya-vision-8b."""
    model_name = "CohereLabs/aya-vision-8b"

    engine_args = EngineArgs(
        model=model_name,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    # Render the prompt with the model's own chat template.
    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_bee(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for Open-Bee/Bee-8B-RL."""
    model_name = "Open-Bee/Bee-8B-RL"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=16384,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
        trust_remote_code=True,
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_command_a_vision(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for CohereLabs/command-a-vision-07-2025."""
    model_name = "CohereLabs/command-a-vision-07-2025"

    # NOTE: This model is 122B parameters and requires tensor parallelism
    # Recommended to use tp=4 on H100 GPUs
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=32768,
        tensor_parallel_size=4,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_deepseek_vl2(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for deepseek-ai/deepseek-vl2-tiny."""
    model_name = "deepseek-ai/deepseek-vl2-tiny"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        # Map the checkpoint onto vLLM's implementation class.
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Numbered <image> placeholders: "image_1:<image>\n", "image_2:<image>\n", ...
    placeholder = "".join(
        f"image_{i}:<image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    prompt = f"<|User|>: {placeholder}{question}\n\n<|Assistant|>:"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_deepseek_ocr(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image OCR request for deepseek-ai/DeepSeek-OCR."""
    from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor

    model_name = "deepseek-ai/DeepSeek-OCR"

    engine_args = EngineArgs(
        model=model_name,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        logits_processors=[NGramPerReqLogitsProcessor],
    )

    placeholder = "<image>\n" * len(image_urls)
    prompt = placeholder + question

    # The following sampling params config is taken from
    # the official Deepseek-OCR inference example.
    # (IMPORTANT) Use the custom logits processor and avoid skipping
    # special tokens for this model for the optimal OCR performance.
    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=8192,
        # ngram logit processor args
        extra_args=dict(
            ngram_size=30,
            window_size=90,
            # whitelist: <td>, </td>
            whitelist_token_ids={128821, 128822},
        ),
        skip_special_tokens=False,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
        sampling_params=sampling_params,
    )
def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for google/gemma-3-4b-it."""
    model_name = "google/gemma-3-4b-it"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for h2oai/h2ovl-mississippi-800m."""
    model_name = "h2oai/h2ovl-mississippi-800m"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    stop_token_ids = [tokenizer.eos_token_id]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
    )
# HunyuanOCR
def load_hunyuan_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for tencent/HunyuanOCR."""
    model_name = "tencent/HunyuanOCR"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # One image-placeholder triple (start/content/end tokens) per image.
    placeholder = (
        "<|hy_place▁holder▁no▁100|><|hy_place▁holder▁no▁102|><|hy_place▁holder▁no▁101|>"  # noqa: E501
    ) * len(image_urls)
    prompt = f"<|hy_begin▁of▁sentence|>{placeholder}{question}<|hy_User|>"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_hyperclovax_seed_vision(
    question: str, image_urls: list[str]
) -> ModelRequestData:
    """Build a multi-image request for HyperCLOVAX-SEED-Vision-Instruct-3B.

    Each image entry carries extra OCR/lens-keyword fields expected by this
    model's chat template (left empty here).
    """
    model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=16384,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    message = {"role": "user", "content": []}
    for _image_url in image_urls:
        message["content"].append(
            {
                "type": "image",
                "image": _image_url,
                "ocr": "",
                "lens_keywords": "",
                "lens_local_keywords": "",
            }
        )
    message["content"].append(
        {
            "type": "text",
            "text": question,
        }
    )

    prompt = tokenizer.apply_chat_template(
        [
            message,
        ],
        tokenize=False,
        add_generation_prompt=True,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=None,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for HuggingFaceM4/Idefics3-8B-Llama3."""
    model_name = "HuggingFaceM4/Idefics3-8B-Llama3"

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=16,
        enforce_eager=True,
        limit_mm_per_prompt={"image": len(image_urls)},
        # if you are running out of memory, you can reduce the "longest_edge".
        # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
        mm_processor_kwargs={
            "size": {"longest_edge": 2 * 364},
        },
    )

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for internlm/Intern-S1-mini."""
    model_name = "internlm/Intern-S1-mini"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = "\n".join(
        f"Image-{i}: <IMG_CONTEXT>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for OpenGVLab/InternVL2-2B."""
    model_name = "OpenGVLab/InternVL2-2B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Stop tokens for InternVL
    # models variants may have different stop tokens
    # please refer to the model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for Kwai-Keye/Keye-VL-8B-Preview."""
    model_name = "Kwai-Keye/Keye-VL-8B-Preview"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        },
    ]

    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    image_data = [fetch_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )
def load_keye_vl1_5(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for Kwai-Keye/Keye-VL-1_5-8B."""
    model_name = "Kwai-Keye/Keye-VL-1_5-8B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=32768,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        },
    ]

    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    image_data = [fetch_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )
def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for moonshotai/Kimi-VL-A3B-Instruct."""
    model_name = "moonshotai/Kimi-VL-A3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=4,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for meta-llama/Llama-4-Scout-17B-16E-Instruct."""
    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=131072,
        tensor_parallel_size=8,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for llava-hf/llava-1.5-7b-hf."""
    # NOTE: CAUTION! Original Llava models wasn't really trained on multi-image inputs,
    # it will generate poor response for multi-image inputs!
    model_name = "llava-hf/llava-1.5-7b-hf"

    engine_args = EngineArgs(
        model=model_name,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for llava-hf/llava-v1.6-mistral-7b-hf."""
    model_name = "llava-hf/llava-v1.6-mistral-7b-hf"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for llava-hf/llava-onevision-qwen2-7b-ov-hf."""
    model_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=16384,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for Mistral-Small-3.1-24B-Instruct-2503."""
    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        # Skip the duplicate single-file checkpoint to save download time/space.
        ignore_patterns=["consolidated.safetensors"],
    )

    placeholders = "[IMG]" * len(image_urls)
    prompt = f"<s>[INST]{question}\n{placeholders}[/INST]"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for nvidia/NVLM-D-72B."""
    model_name = "nvidia/NVLM-D-72B"

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        tensor_parallel_size=4,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
# Ovis
def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for AIDC-AI/Ovis2-1B."""
    model_name = "AIDC-AI/Ovis2-1B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="half",
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
# ovis2_5
def load_ovis2_5(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for AIDC-AI/Ovis2.5-2B."""
    model_name = "AIDC-AI/Ovis2.5-2B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="half",
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    prompt = (
        f"<|im_start|>user\n\n{placeholders}\n{question}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_paddleocr_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for PaddlePaddle/PaddleOCR-VL."""
    model_name = "PaddlePaddle/PaddleOCR-VL"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>" * len(image_urls)
    prompt = f"<|begin_of_sentence|>User: {question}{placeholders}\nAssistant: "

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for mistral-community/pixtral-12b."""
    model_name = "mistral-community/pixtral-12b"

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = "[IMG]" * len(image_urls)
    prompt = f"<s>[INST]{question}\n{placeholders}[/INST]"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for microsoft/Phi-3.5-vision-instruct."""
    # num_crops is an override kwarg to the multimodal image processor;
    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
    # to use 16 for single frame scenarios, and 4 for multi-frame.
    #
    # Generally speaking, a larger value for num_crops results in more
    # tokens per image instance, because it may scale the image more in
    # the image preprocessing. Some references in the model docs and the
    # formula for image tokens after the preprocessing
    # transform can be found below.
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    engine_args = EngineArgs(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"num_crops": 4},
    )

    # Numbered placeholders: <|image_1|>, <|image_2|>, ...
    placeholders = "\n".join(
        f"<|image_{i}|>" for i, _ in enumerate(image_urls, start=1)
    )
    prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
    """
    Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
    show how to process multi images inputs.
    """
    model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    vision_lora_path = os.path.join(model_path, "vision-lora")

    engine_args = EngineArgs(
        model=model_path,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        enable_lora=True,
        max_lora_rank=320,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"dynamic_hd": 4},
    )

    # Image placeholders are 1-indexed and concatenated without separators.
    image_tags = [f"<|image_{idx}|>" for idx in range(1, len(image_urls) + 1)]
    prompt = f"<|user|>{''.join(image_tags)}{question}<|end|><|assistant|>"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
        lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
    )
2025-05-26 17:57:54 +01:00
def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for Qwen/Qwen-VL-Chat."""
    model_name = "Qwen/Qwen-VL-Chat"
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=1024,
        max_num_seqs=2,
        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Qwen-VL-Chat expects "Picture k: <img></img>" markers, 1-indexed.
    picture_tags = [
        f"Picture {idx}: <img></img>\n" for idx in range(1, len(image_urls) + 1)
    ]
    placeholders = "".join(picture_tags)

    # This model does not have a chat_template attribute on its tokenizer,
    # so we need to explicitly pass it. We use ChatML since it's used in the
    # generation utils of the model:
    # https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating
    chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{'<|im_start|>assistant\n'}}{% endif %}"  # noqa: E501

    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        chat_template=chat_template,
    )

    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(tok) for tok in stop_tokens]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
        chat_template=chat_template,
    )
2025-03-17 18:00:17 +08:00
def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for Qwen/Qwen2-VL-7B-Instruct."""
    try:
        from qwen_vl_utils import smart_resize
    except ModuleNotFoundError:
        print(
            "WARNING: `qwen-vl-utils` not installed, input images will not "
            "be automatically resized. You can enable this functionality by "
            "`pip install qwen-vl-utils`."
        )
        smart_resize = None

    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    # Tested on L40
    engine_args = EngineArgs(
        model=model_name,
        # Without smart_resize the per-image token count is unbounded, so a
        # much longer context is reserved.
        max_model_len=32768 if smart_resize is None else 4096,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    image_entries = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                *image_entries,
                {"type": "text", "text": question},
            ],
        },
    ]

    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    def _maybe_resize(image: Image) -> Image:
        # Pass through unchanged when qwen-vl-utils is unavailable.
        if smart_resize is None:
            return image
        width, height = image.size
        resized_height, resized_width = smart_resize(
            height, width, max_pixels=1024 * 28 * 28
        )
        return image.resize((resized_width, resized_height))

    image_data = [_maybe_resize(fetch_image(url)) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )
2024-09-07 16:38:23 +08:00
2025-03-17 18:00:17 +08:00
def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for Qwen/Qwen2.5-VL-3B-Instruct."""
    try:
        from qwen_vl_utils import smart_resize
    except ModuleNotFoundError:
        print(
            "WARNING: `qwen-vl-utils` not installed, input images will not "
            "be automatically resized. You can enable this functionality by "
            "`pip install qwen-vl-utils`."
        )
        smart_resize = None

    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
    engine_args = EngineArgs(
        model=model_name,
        # Without smart_resize the per-image token count is unbounded, so a
        # much longer context is reserved.
        max_model_len=32768 if smart_resize is None else 4096,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    image_entries = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                *image_entries,
                {"type": "text", "text": question},
            ],
        },
    ]

    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    def _maybe_resize(image: Image) -> Image:
        # Pass through unchanged when qwen-vl-utils is unavailable.
        if smart_resize is None:
            return image
        width, height = image.size
        resized_height, resized_width = smart_resize(
            height, width, max_pixels=1024 * 28 * 28
        )
        return image.resize((resized_width, resized_height))

    image_data = [_maybe_resize(fetch_image(url)) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )
2025-08-21 12:08:52 +08:00
def load_r_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for YannQi/R-4B."""
    model_name = "YannQi/R-4B"
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=16384,
        max_num_seqs=16,
        trust_remote_code=True,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # One image entry per URL, followed by the text question.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    messages = [{"role": "user", "content": user_content}]

    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
2025-08-01 01:35:49 -07:00
def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for HuggingFaceTB/SmolVLM2-2.2B-Instruct."""
    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model="HuggingFaceTB/SmolVLM2-2.2B-Instruct",
        max_model_len=8192,
        max_num_seqs=16,
        enforce_eager=True,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={
            "max_image_size": {"longest_edge": 384},
        },
    )

    # Each image gets a labeled "<image>" tag, 1-indexed.
    image_tags = [f"Image-{idx}: <image>\n" for idx in range(1, len(image_urls) + 1)]
    placeholders = "\n".join(image_tags)
    prompt = (
        f"<|im_start|>User: {placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_step3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for stepfun-ai/step3-fp8."""
    # NOTE: Below are verified configurations for step3-fp8
    # on 8xH100 GPUs.
    engine_args = EngineArgs(
        model="stepfun-ai/step3-fp8",
        max_num_batched_tokens=4096,
        gpu_memory_utilization=0.85,
        tensor_parallel_size=8,
        limit_mm_per_prompt={"image": len(image_urls)},
        reasoning_parser="step3",
    )

    # One "<im_patch>" token per image, prepended to the question.
    image_tokens = "<im_patch>" * len(image_urls)
    prompt = (
        "<|begin▁of▁sentence|>You are a helpful assistant.<|BOT|>user\n"
        f"{image_tokens}{question}<|EOT|><|BOT|"
        ">assistant\n<think>\n"
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
2025-06-03 13:13:13 +08:00
def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for omni-research/Tarsier-7b."""
    engine_args = EngineArgs(
        model="omni-research/Tarsier-7b",
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # One "<image>" token per image in a plain USER/ASSISTANT prompt.
    image_tokens = "<image>" * len(image_urls)
    prompt = f"USER: {image_tokens}\n{question}\nASSISTANT:"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
2025-06-21 12:01:51 +08:00
def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for omni-research/Tarsier2-Recap-7b."""
    engine_args = EngineArgs(
        model="omni-research/Tarsier2-Recap-7b",
        trust_remote_code=True,
        max_model_len=32768,
        limit_mm_per_prompt={"image": len(image_urls)},
        # The HF config does not declare the vLLM architecture/model_type,
        # so both are overridden here.
        hf_overrides={
            "architectures": ["Tarsier2ForConditionalGeneration"],
            "model_type": "tarsier2",
        },
    )

    # ChatML-style prompt with one "<|image_pad|>" per image inside the
    # vision span.
    pads = "<|image_pad|>" * len(image_urls)
    prompt = (
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
        f"<|im_start|>user\n<|vision_start|>{pads}"
        f"<|vision_end|>{question}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
2025-08-19 16:56:31 +09:00
# GLM-4.5V
def load_glm4_5v(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for zai-org/GLM-4.5V."""
    model_name = "zai-org/GLM-4.5V"
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=32768,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    # One image entry per URL, followed by the text question.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    messages = [{"role": "user", "content": user_content}]

    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
# GLM-4.5V-FP8
def load_glm4_5v_fp8(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for zai-org/GLM-4.5V-FP8."""
    model_name = "zai-org/GLM-4.5V-FP8"
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=32768,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    # One image entry per URL, followed by the text question.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    messages = [{"role": "user", "content": user_content}]

    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
2024-09-07 16:38:23 +08:00
# Maps each supported `--model-type` CLI value to the loader that builds its
# ModelRequestData (engine args, prompt, and fetched images). The key order
# is what argparse displays as valid choices; keys are mostly alphabetical,
# with the glm4_5v entries appended at the end.
model_example_map = {
    "aria": load_aria,
    "aya_vision": load_aya_vision,
    "bee": load_bee,
    "command_a_vision": load_command_a_vision,
    "deepseek_vl_v2": load_deepseek_vl2,
    "deepseek_ocr": load_deepseek_ocr,
    "gemma3": load_gemma3,
    "h2ovl_chat": load_h2ovl,
    "hunyuan_vl": load_hunyuan_vl,
    "hyperclovax_seed_vision": load_hyperclovax_seed_vision,
    "idefics3": load_idefics3,
    "interns1": load_interns1,
    "internvl_chat": load_internvl,
    "keye_vl": load_keye_vl,
    "keye_vl1_5": load_keye_vl1_5,
    "kimi_vl": load_kimi_vl,
    "llama4": load_llama4,
    "llava": load_llava,
    "llava-next": load_llava_next,
    "llava-onevision": load_llava_onevision,
    "mistral3": load_mistral3,
    "NVLM_D": load_nvlm_d,
    "ovis": load_ovis,
    "ovis2_5": load_ovis2_5,
    "paddleocr_vl": load_paddleocr_vl,
    "phi3_v": load_phi3v,
    "phi4_mm": load_phi4mm,
    "pixtral_hf": load_pixtral_hf,
    "qwen_vl_chat": load_qwen_vl_chat,
    "qwen2_vl": load_qwen2_vl,
    "qwen2_5_vl": load_qwen2_5_vl,
    "rvl": load_r_vl,
    "smolvlm": load_smolvlm,
    "step3": load_step3,
    "tarsier": load_tarsier,
    "tarsier2": load_tarsier2,
    "glm4_5v": load_glm4_5v,
    "glm4_5v_fp8": load_glm4_5v_fp8,
}
2025-11-25 14:03:20 +08:00
def run_generate(
    model: str,
    question: str,
    image_urls: list[str],
    seed: int,
    tensor_parallel_size: int | None,
):
    """Run the selected model example through `LLM.generate`.

    Args:
        model: Key into ``model_example_map`` selecting the loader.
        question: Text question asked about the images.
        image_urls: URLs of the images to include in the prompt.
        seed: Seed forwarded to the engine for reproducibility.
        tensor_parallel_size: Optional override of the example's default
            tensor-parallel size.
    """
    req_data = model_example_map[model](question, image_urls)

    # Disable other modalities to save memory (mirrors run_chat).
    default_limits = {"image": 0, "video": 0, "audio": 0}
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
        req_data.engine_args.limit_mm_per_prompt or {}
    )

    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    if tensor_parallel_size is not None:
        engine_args["tensor_parallel_size"] = tensor_parallel_size
    llm = LLM(**engine_args)

    # Honor a loader-supplied SamplingParams override, as run_chat does;
    # otherwise fall back to deterministic greedy decoding.
    sampling_params = (
        SamplingParams(
            temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
        )
        if req_data.sampling_params is None
        else req_data.sampling_params
    )

    outputs = llm.generate(
        {
            "prompt": req_data.prompt,
            "multi_modal_data": {"image": req_data.image_data},
        },
        sampling_params=sampling_params,
        lora_request=req_data.lora_requests,
    )

    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        print("-" * 50)
2024-09-05 18:51:53 +08:00
2025-11-25 14:03:20 +08:00
def run_chat(
    model: str,
    question: str,
    image_urls: list[str],
    seed: int,
    tensor_parallel_size: int | None,
):
    """Run the selected model example through `LLM.chat`.

    Args:
        model: Key into ``model_example_map`` selecting the loader.
        question: Text question asked about the images.
        image_urls: URLs of the images to include in the chat message.
        seed: Seed forwarded to the engine for reproducibility.
        tensor_parallel_size: Optional override of the example's default
            tensor-parallel size.
    """
    req_data = model_example_map[model](question, image_urls)

    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
        req_data.engine_args.limit_mm_per_prompt or {}
    )

    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    if tensor_parallel_size is not None:
        engine_args["tensor_parallel_size"] = tensor_parallel_size
    llm = LLM(**engine_args)

    # Fall back to deterministic greedy decoding unless the loader supplied
    # its own SamplingParams.
    if req_data.sampling_params is None:
        sampling_params = SamplingParams(
            temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
        )
    else:
        sampling_params = req_data.sampling_params

    # A single user turn: the question first, then one image_url part per URL.
    user_content = [{"type": "text", "text": question}]
    user_content += [
        {"type": "image_url", "image_url": {"url": image_url}}
        for image_url in image_urls
    ]

    outputs = llm.chat(
        [{"role": "user", "content": user_content}],
        sampling_params=sampling_params,
        chat_template=req_data.chat_template,
        lora_request=req_data.lora_requests,
    )

    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        print("-" * 50)
2024-09-05 18:51:53 +08:00
2025-04-15 16:05:30 +08:00
def parse_args():
    """Define and parse the CLI arguments for this demo."""
    parser = FlexibleArgumentParser(
        description="Demo on using vLLM for offline inference with "
        "vision language models that support multi-image input for text "
        "generation"
    )
    parser.add_argument(
        "--model-type",
        "-m",
        type=str,
        default="phi3_v",
        choices=model_example_map.keys(),
        help='Huggingface "model_type".',
    )
    parser.add_argument(
        "--method",
        type=str,
        default="generate",
        choices=["generate", "chat"],
        help="The method to run in `vllm.LLM`.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=0,
        help="Set the seed when initializing `vllm.LLM`.",
    )
    # The demo ships a fixed pool of image URLs; cap the count accordingly.
    max_images = len(IMAGE_URLS)
    parser.add_argument(
        "--num-images",
        "-n",
        type=int,
        choices=list(range(1, max_images + 1)),  # the max number of images
        default=2,
        help="Number of images to use for the demo.",
    )
    parser.add_argument(
        "--tensor-parallel-size",
        "-tp",
        type=int,
        default=None,
        help="Tensor parallel size to override the model's default setting.",
    )
    return parser.parse_args()
2024-09-05 18:51:53 +08:00
2025-04-15 16:05:30 +08:00
def main(args: Namespace):
    """Validate the parsed arguments and dispatch to generate or chat."""
    model = args.model_type
    method = args.method
    seed = args.seed

    tensor_parallel_size = args.tensor_parallel_size
    if tensor_parallel_size is not None and tensor_parallel_size < 1:
        raise ValueError(
            f"tensor_parallel_size must be a positive integer, "
            f"got {tensor_parallel_size}"
        )

    image_urls = IMAGE_URLS[: args.num_images]

    # Dispatch table instead of an if/elif chain.
    runners = {"generate": run_generate, "chat": run_chat}
    if method not in runners:
        raise ValueError(f"Invalid method: {method}")
    runners[method](model, QUESTION, image_urls, seed, tensor_parallel_size)
if __name__ == "__main__":
    # Script entry point: parse CLI flags and run the demo.
    main(parse_args())