# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM to run offline inference with
the correct prompt format on vision language models for text generation.

For most models, the prompt format should follow the corresponding examples
on the HuggingFace model repository.
"""

import os
import random
from contextlib import contextmanager
from dataclasses import asdict
from typing import NamedTuple

from huggingface_hub import snapshot_download
from transformers import AutoProcessor, AutoTokenizer

from vllm import LLM, EngineArgs, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.lora.request import LoRARequest
from vllm.multimodal.image import convert_image_mode
from vllm.utils.argparse_utils import FlexibleArgumentParser


class ModelRequestData(NamedTuple):
    engine_args: EngineArgs
    prompts: list[str]
    stop_token_ids: list[int] | None = None
    lora_requests: list[LoRARequest] | None = None
    sampling_params: list[SamplingParams] | None = None


# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.
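
# A minimal sketch of how a ModelRequestData is typically consumed (the
# script's actual entry point lives further below; `image` and the question
# here are hypothetical placeholders):
#
#     req = run_llava(["What is in this image?"], "image")
#     llm = LLM(**asdict(req.engine_args))
#     params = SamplingParams(max_tokens=64, stop_token_ids=req.stop_token_ids)
#     outputs = llm.generate(
#         [{"prompt": p, "multi_modal_data": {"image": image}} for p in req.prompts],
#         params,
#     )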


# Aria
def run_aria(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "rhymes-ai/Aria"

    # NOTE: Need L40 (or equivalent) to avoid OOM
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        dtype="bfloat16",
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        (
            f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
            "<|im_end|>\n<|im_start|>assistant\n"
        )
        for question in questions
    ]

    stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


# Aya Vision
def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "CohereLabs/aya-vision-8b"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"crop_to_patches": True},
        limit_mm_per_prompt={modality: 1},
    )
    prompts = [
        f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Bee-8B
def run_bee(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "Open-Bee/Bee-8B-RL"

    prompts = [
        (
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            f"<|im_start|>user\n<image>\n{question}<|im_end|>"
            "<|im_start|>assistant\n<think>\n"
        )
        for question in questions
    ]

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=16384,
        limit_mm_per_prompt={modality: 1},
        trust_remote_code=True,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# BAGEL
def run_bagel(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "ByteDance-Seed/BAGEL-7B-MoT"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        (
            f"<|im_start|>user\n<|image_pad|>\n{question}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# BLIP-2
def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262  # noqa
    prompts = [f"Question: {question} Answer:" for question in questions]
    engine_args = EngineArgs(
        model="Salesforce/blip2-opt-2.7b",
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Chameleon
def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [f"{question}<image>" for question in questions]
    engine_args = EngineArgs(
        model="facebook/chameleon-7b",
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Command A Vision
def run_command_a_vision(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "CohereLabs/command-a-vision-07-2025"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=32768,
        tensor_parallel_size=4,
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><|IMG_PATCH|>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Deepseek-VL2
def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "deepseek-ai/deepseek-vl2-tiny"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        f"<|User|>: <image>\n{question}\n\n<|Assistant|>:" for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# DeepSeek-OCR
def run_deepseek_ocr(questions: list[str], modality: str) -> ModelRequestData:
    from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor

    assert modality == "image"

    model_name = "deepseek-ai/DeepSeek-OCR"

    engine_args = EngineArgs(
        model=model_name,
        limit_mm_per_prompt={modality: 1},
        logits_processors=[NGramPerReqLogitsProcessor],
    )

    # DeepSeek-OCR uses a plain prompt template.
    prompts = [f"<image>\n{question}" for question in questions]

    # The following sampling params config is taken from
    # the official DeepSeek-OCR inference example.
    # (IMPORTANT) Use the custom logits processor and avoid skipping
    # special tokens for this model for optimal OCR performance.
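    # (As I understand it, the per-request n-gram processor penalizes
    # re-emitting recently generated n-grams, which guards against the
    # repetition loops that greedy OCR decoding is prone to; the token ids
    # below are whitelisted because table-cell tags legitimately repeat.)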
    sampling_params = [
        SamplingParams(
            temperature=0.0,
            max_tokens=8192,
            # ngram logit processor args
            extra_args=dict(
                ngram_size=30,
                window_size=90,
                # whitelist: <td>, </td>
                whitelist_token_ids={128821, 128822},
            ),
            skip_special_tokens=False,
        )
        for _ in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        sampling_params=sampling_params,
    )


# Dots-OCR
def run_dots_ocr(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [f"<|img|><|imgpad|><|endofimg|>{question}" for question in questions]
    engine_args = EngineArgs(
        model="rednote-hilab/dots.ocr",
        limit_mm_per_prompt={modality: 1},
        trust_remote_code=True,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Eagle2.5-VL
def run_eagle2_5(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "nvidia/Eagle2.5-8B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        limit_mm_per_prompt={modality: 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    messages = [
        [{"role": "user", "content": f"<image>\n{question}"}] for question in questions
    ]
    prompts = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Stop tokens for Eagle2.5 (Qwen2-based)
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
    stop_token_ids = [token_id for token_id in stop_token_ids if token_id is not None]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


# Ernie4.5-VL
def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "baidu/ERNIE-4.5-VL-28B-A3B-PT"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        limit_mm_per_prompt={modality: 1},
        trust_remote_code=True,
    )

    if modality == "image":
        placeholder = "Picture 1:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>"
    elif modality == "video":
        placeholder = "Video 1:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>"

    prompts = [
        (
            f"<|begin_of_sentence|>User: {question}{placeholder}\n"
            "Assistant: <think></think>"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Fuyu
def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [f"{question}\n" for question in questions]
    engine_args = EngineArgs(
        model="adept/fuyu-8b",
        max_model_len=2048,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Gemma 3
def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "google/gemma-3-4b-it"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
        mm_processor_kwargs={"do_pan_and_scan": True},
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        (
            "<bos><start_of_turn>user\n"
            f"<start_of_image>{question}<end_of_turn>\n"
            "<start_of_turn>model\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Gemma3N
def run_gemma3n(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "google/gemma-3n-E2B-it"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
    )

    prompts = [
        (
            "<start_of_turn>user\n"
            f"<image_soft_token>{question}<end_of_turn>\n"
            "<start_of_turn>model\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# GLM-4v
def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "zai-org/glm-4v-9b"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=2048,
        max_num_seqs=2,
        trust_remote_code=True,
        enforce_eager=True,
        hf_overrides={"architectures": ["GLM4VForCausalLM"]},
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        (
            "<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>"
            f"{question}<|assistant|>"
        )
        for question in questions
    ]

    stop_token_ids = [151329, 151336, 151338]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


# GLM-4.1V
def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "zai-org/GLM-4.1V-9B-Thinking"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        mm_processor_kwargs={
            "size": {"shortest_edge": 12544, "longest_edge": 47040000},
            "fps": 1,
        },
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
    )

    if modality == "image":
        placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
    elif modality == "video":
        placeholder = "<|begin_of_video|><|video|><|end_of_video|>"

    prompts = [
        (
            "[gMASK]<sop><|system|>\nYou are a helpful assistant.<|user|>\n"
            f"{placeholder}"
            f"{question}<|assistant|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# GLM-4.5V
def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "zai-org/GLM-4.5V"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        mm_processor_kwargs={
            "size": {"shortest_edge": 12544, "longest_edge": 47040000},
            "fps": 1,
        },
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    if modality == "image":
        placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
    elif modality == "video":
        placeholder = "<|begin_of_video|><|video|><|end_of_video|>"

    prompts = [
        (
            "[gMASK]<sop><|system|>\nYou are a helpful assistant.<|user|>\n"
            f"{placeholder}"
            f"{question}<|assistant|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# GLM-4.5V-FP8
def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "zai-org/GLM-4.5V-FP8"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        mm_processor_kwargs={
            "size": {"shortest_edge": 12544, "longest_edge": 47040000},
            "fps": 1,
        },
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    if modality == "image":
        placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
    elif modality == "video":
        placeholder = "<|begin_of_video|><|video|><|end_of_video|>"

    prompts = [
        (
            "[gMASK]<sop><|system|>\nYou are a helpful assistant.<|user|>\n"
            f"{placeholder}"
            f"{question}<|assistant|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# H2OVL-Mississippi
def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "h2oai/h2ovl-mississippi-800m"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={modality: 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    messages = [
        [{"role": "user", "content": f"<image>\n{question}"}] for question in questions
    ]
    prompts = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    stop_token_ids = [tokenizer.eos_token_id]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


# HunyuanOCR
def run_hunyuan_vl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "tencent/HunyuanOCR"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        limit_mm_per_prompt={modality: 1},
    )

    placeholder = "<|hy_place▁holder▁no▁100|><|hy_place▁holder▁no▁102|><|hy_place▁holder▁no▁101|>"  # noqa: E501
    prompts = [
        f"<|hy_begin▁of▁sentence|>{placeholder}{question}<|hy_User|>"
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=None,
    )


# naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B
def run_hyperclovax_seed_vision(
    questions: list[str], modality: str
) -> ModelRequestData:
    model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192 if modality == "image" else 16384,
        limit_mm_per_prompt={modality: 1},
    )

    messages = []
    for question in questions:
        if modality == "image":
            """
            ocr: List the words in the image in raster order.
                Even if the word order feels unnatural for reading,
                the model will handle it as long as it follows raster order.
                e.g. "Naver, CLOVA, bigshane"
            lens_keywords: List the entity names in the image.
                e.g. "iPhone"
            lens_local_keywords: List the entity names with quads in the image.
                e.g. "[0.07, 0.21, 0.92, 0.90] iPhone"
            """
            messages.append(
                [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image",
                                "ocr": "",
                                "lens_keywords": "",
                                "lens_local_keywords": "",
                            },
                            {
                                "type": "text",
                                "text": question,
                            },
                        ],
                    }
                ]
            )
        elif modality == "video":
            messages.append(
                [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "video",
                            },
                            {
                                "type": "text",
                                "text": question,
                            },
                        ],
                    }
                ]
            )
        else:
            raise ValueError(f"Unsupported modality: {modality}")

    prompts = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=None,
    )


# Idefics3-8B-Llama3
def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "HuggingFaceM4/Idefics3-8B-Llama3"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        # If you are running out of memory, you can reduce the "longest_edge".
        # See: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
        mm_processor_kwargs={
            "size": {"longest_edge": 3 * 364},
        },
        limit_mm_per_prompt={modality: 1},
    )
    prompts = [
        f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Intern-S1
def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "internlm/Intern-S1-mini"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
    )

    if modality == "image":
        placeholder = "<IMG_CONTEXT>"
    elif modality == "video":
        placeholder = "<video>"

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    messages = [
        [{"role": "user", "content": f"{placeholder}\n{question}"}]
        for question in questions
    ]
    prompts = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# InternVL
def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "OpenGVLab/InternVL3-2B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={modality: 1},
    )

    if modality == "image":
        placeholder = "<image>"
    elif modality == "video":
        placeholder = "<video>"

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    messages = [
        [{"role": "user", "content": f"{placeholder}\n{question}"}]
        for question in questions
    ]
    prompts = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Stop tokens for InternVL
    # Model variants may have different stop tokens;
    # please refer to the model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
    stop_token_ids = [token_id for token_id in stop_token_ids if token_id is not None]
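    # (The None-filter above is defensive: a stop token missing from a given
    # variant's vocabulary may not map to a real token id.)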

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


# Kanana-V
def run_kanana_v(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "kakaocorp/kanana-1.5-v-3b-instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        trust_remote_code=True,
        limit_mm_per_prompt={modality: 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    messages = [
        [{"role": "user", "content": f"<image>\n{question}"}] for question in questions
    ]
    prompts = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Keye-VL
def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "Kwai-Keye/Keye-VL-8B-Preview"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        trust_remote_code=True,
        limit_mm_per_prompt={modality: 1},
    )

    if modality == "image":
        placeholder = "<|image_pad|>"
    elif modality == "video":
        placeholder = "<|video_pad|>"

    prompts = [
        (
            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
            f"{question}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Keye-VL-1.5
def run_keye_vl1_5(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "Kwai-Keye/Keye-VL-1.5-8B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        trust_remote_code=True,
        limit_mm_per_prompt={modality: 1},
    )

    if modality == "image":
        placeholder = "<|image_pad|>"
    elif modality == "video":
        placeholder = "<|video_pad|>"

    prompts = [
        (
            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
            f"{question}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Kimi-VL
def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [
        "<|im_user|>user<|im_middle|><|media_start|>image<|media_content|>"
        f"<|media_pad|><|media_end|>{question}<|im_end|>"
        "<|im_assistant|>assistant<|im_middle|>"
        for question in questions
    ]

    engine_args = EngineArgs(
        model="moonshotai/Kimi-VL-A3B-Instruct",
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# LightOnOCR
def run_lightonocr(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [
        "<|im_start|>system<|im_end|>\n<|im_start|>user\n<|image_pad|><|im_end|>\n<|im_start|>assistant\n"
        for _ in questions
    ]

    engine_args = EngineArgs(
        model="lightonai/LightOnOCR-1B",
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# LFM2-VL
def run_lfm2_vl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "LiquidAI/LFM2-VL-450M"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        limit_mm_per_prompt={modality: 1},
    )

    processor = AutoProcessor.from_pretrained(model_name)
    messages = [
        [
            {
                "role": "user",
                "content": [{"type": "image"}, {"type": "text", "text": question}],
            }
        ]
        for question in questions
    ]
    prompts = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Llama 4
def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=4,
        tensor_parallel_size=8,
        gpu_memory_utilization=0.4,
        limit_mm_per_prompt={modality: 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    messages = [
        [
            {
                "role": "user",
                "content": [{"type": "image"}, {"type": "text", "text": question}],
            }
        ]
        for question in questions
    ]
    prompts = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=False
    )
    stop_token_ids = None

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


# LLaVA-1.5
def run_llava(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [f"USER: <image>\n{question}\nASSISTANT:" for question in questions]

    engine_args = EngineArgs(
        model="llava-hf/llava-1.5-7b-hf",
        max_model_len=4096,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
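
# For instance, pairing these LLaVA prompts with one of vLLM's bundled image
# assets might look like this (a sketch; ImageAsset is imported above):
#
#     image = ImageAsset("cherry_blossom").pil_image
#     req = run_llava(["What is in this image?"], "image")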


# LLaVA-1.6/LLaVA-NeXT
def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [f"[INST] <image>\n{question} [/INST]" for question in questions]
    engine_args = EngineArgs(
        model="llava-hf/llava-v1.6-mistral-7b-hf",
        max_model_len=8192,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# LLaVA-NeXT-Video
# Currently only video input is supported
def run_llava_next_video(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "video"

    prompts = [f"USER: <video>\n{question} ASSISTANT:" for question in questions]
    engine_args = EngineArgs(
        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
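
# Analogously for video inputs (a sketch; VideoAsset is imported above, and
# the asset name here is hypothetical):
#
#     video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays
#     req = run_llava_next_video(["Why is this video funny?"], "video")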


# LLaVA-OneVision
def run_llava_onevision(questions: list[str], modality: str) -> ModelRequestData:
    if modality == "video":
        prompts = [
            f"<|im_start|>user <video>\n{question}<|im_end|><|im_start|>assistant\n"
            for question in questions
        ]
    elif modality == "image":
        prompts = [
            f"<|im_start|>user <image>\n{question}<|im_end|><|im_start|>assistant\n"
            for question in questions
        ]

    engine_args = EngineArgs(
        model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
        max_model_len=16384,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Mantis
def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"  # noqa: E501
    prompts = [llama3_template.format(f"{question}\n<image>") for question in questions]

    engine_args = EngineArgs(
        model="TIGER-Lab/Mantis-8B-siglip-llama3",
        max_model_len=4096,
        hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
        limit_mm_per_prompt={modality: 1},
    )
    stop_token_ids = [128009]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


# MiniCPM-V
def run_minicpmv_base(questions: list[str], modality: str, model_name):
    assert modality in ["image", "video"]
    # If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py`  # noqa

    # 2.0
    # The official repo doesn't work yet, so we need to use a fork for now.
    # For more details, see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630  # noqa
    # model_name = "HwwwH/MiniCPM-V-2"

    # 2.5
    # model_name = "openbmb/MiniCPM-Llama3-V-2_5"

    # 2.6
    # model_name = "openbmb/MiniCPM-V-2_6"
    # o2.6

    # Modality support:
    # 2.0: image
    # 2.5: image
    # 2.6: image, video
    # o2.6: image, video, audio
    # model_name = "openbmb/MiniCPM-o-2_6"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        limit_mm_per_prompt={modality: 1},
    )
    # NOTE: The stop_token_ids differ between versions of MiniCPM-V.
    # 2.0
    # stop_token_ids = [tokenizer.eos_id]

    # 2.5
    # stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]

    # 2.6 / o2.6
    stop_tokens = ["<|im_end|>", "<|endoftext|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    modality_placeholder = {
        "image": "(<image>./</image>)",
        "video": "(<video>./</video>)",
    }

    prompts = [
        tokenizer.apply_chat_template(
            [
                {
                    "role": "user",
                    "content": f"{modality_placeholder[modality]}\n{question}",
                }
            ],
            tokenize=False,
            add_generation_prompt=True,
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


def run_minicpmo(questions: list[str], modality: str) -> ModelRequestData:
    return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-o-2_6")


def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData:
    return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-V-2_6")


# MiniMax-VL-01
def run_minimax_vl_01(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "MiniMaxAI/MiniMax-VL-01"

    engine_args = EngineArgs(
        model=model_name,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
        trust_remote_code=True,
        tensor_parallel_size=8,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    messages = [
        [
            {
                "role": "user",
                "content": [{"type": "image"}, {"type": "text", "text": question}],
            }
        ]
        for question in questions
    ]
    prompts = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=False
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Mistral-3 HF-format
def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"

    # NOTE: Need L40 (or equivalent) to avoid OOM
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={modality: 1},
        ignore_patterns=["consolidated.safetensors"],
    )

    prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Molmo
def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "allenai/Molmo-7B-D-0924"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        f"<|im_start|>user <image>\n{question}<|im_end|><|im_start|>assistant\n"
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Molmo2
def run_molmo2(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "allenai/Molmo2-8B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={modality: 1},
        max_num_batched_tokens=36864,
    )

    if modality == "image":
        placeholder = "<|image|>"
    elif modality == "video":
        placeholder = "<|video|>"
    else:
        raise ValueError(f"Unsupported modality for molmo2: {modality}")

    prompts = [
        f"{placeholder}<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Nemotron-VL
def run_nemotron_vl(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={modality: 1},
    )

    assert modality == "image"
    placeholder = "<image>"

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    messages = [
        [{"role": "user", "content": f"{placeholder}\n{question}"}]
        for question in questions
    ]
    prompts = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Stop tokens for InternVL
    # Model variants may have different stop tokens;
    # please refer to the model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
    stop_token_ids = [token_id for token_id in stop_token_ids if token_id is not None]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


# NVLM-D
def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "nvidia/NVLM-D-72B"

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        tensor_parallel_size=4,
        limit_mm_per_prompt={modality: 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    messages = [
        [{"role": "user", "content": f"<image>\n{question}"}] for question in questions
    ]
    prompts = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
|
2024-10-07 19:55:12 +08:00
|
|
|
|
|
|
|
|
|
|
|
2025-05-12 08:56:30 +08:00
|
|
|
|
# Ovis
|
|
|
|
|
|
def run_ovis(questions: list[str], modality: str) -> ModelRequestData:
|
2025-04-30 09:33:29 +02:00
|
|
|
|
assert modality == "image"
|
|
|
|
|
|
|
|
|
|
|
|
model_name = "AIDC-AI/Ovis2-1B"
|
|
|
|
|
|
|
|
|
|
|
|
engine_args = EngineArgs(
|
|
|
|
|
|
model=model_name,
|
|
|
|
|
|
max_model_len=4096,
|
|
|
|
|
|
max_num_seqs=2,
|
|
|
|
|
|
trust_remote_code=True,
|
|
|
|
|
|
dtype="half",
|
2025-05-07 00:12:28 +08:00
|
|
|
|
limit_mm_per_prompt={modality: 1},
|
2025-04-30 09:33:29 +02:00
|
|
|
|
)
|
|
|
|
|
|
|
2025-05-26 17:57:54 +01:00
|
|
|
|
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
|
|
|
|
|
messages = [
|
|
|
|
|
|
[{"role": "user", "content": f"<image>\n{question}"}] for question in questions
|
|
|
|
|
|
]
|
|
|
|
|
|
prompts = tokenizer.apply_chat_template(
|
|
|
|
|
|
messages, tokenize=False, add_generation_prompt=True
|
|
|
|
|
|
)
|
2025-04-30 09:33:29 +02:00
|
|
|
|
|
|
|
|
|
|
return ModelRequestData(
|
|
|
|
|
|
engine_args=engine_args,
|
|
|
|
|
|
prompts=prompts,
|
|
|
|
|
|
)


# Ovis2_5
def run_ovis2_5(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "AIDC-AI/Ovis2.5-2B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="half",
        limit_mm_per_prompt={modality: 1},
    )

    if modality == "image":
        placeholder = "<image>"
    elif modality == "video":
        placeholder = "<video>"

    prompts = [
        f"<|im_start|>user\n\n{placeholder}\n{question}<|im_end|>\n<|im_start|>assistant\n"
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# PaddleOCR-VL
def run_paddleocr_vl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "PaddlePaddle/PaddleOCR-VL"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
        trust_remote_code=True,
    )

    placeholder = "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>"
    prompts = [
        f"<|begin_of_sentence|>User: {question}{placeholder}\nAssistant: "
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# PaliGemma
def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    # PaliGemma has a special prompt format for VQA, so the questions are
    # ignored here and a fixed captioning prompt is used instead.
    prompts = ["caption en" for _ in questions]
    engine_args = EngineArgs(
        model="google/paligemma-3b-mix-224",
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# PaliGemma 2
def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    # PaliGemma 2 has a special prompt format for VQA, so the questions are
    # ignored here and a fixed captioning prompt is used instead.
    prompts = ["caption en" for _ in questions]
    engine_args = EngineArgs(
        model="google/paligemma2-3b-ft-docci-448",
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Phi-3-Vision
def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    prompts = [
        f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n"
        for question in questions
    ]

    # num_crops is an override kwarg to the multimodal image processor;
    # for some models, e.g., Phi-3.5-vision-instruct, it is recommended
    # to use 16 for single-frame scenarios, and 4 for multi-frame.
    #
    # Generally speaking, a larger value for num_crops results in more
    # tokens per image instance, because it may scale the image more in
    # the image preprocessing. Some references in the model docs and the
    # formula for image tokens after the preprocessing
    # transform can be found below.
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    engine_args = EngineArgs(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"num_crops": 16},
        limit_mm_per_prompt={modality: 1},
    )
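
    # As a sketch only (hypothetical values): per the note above, the override
    # can also be supplied per request instead of engine-wide, e.g.:
    #
    #   llm.generate({
    #       "prompt": prompts[0],
    #       "multi_modal_data": {"image": image},
    #       "mm_processor_kwargs": {"num_crops": 4},
    #   })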

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Phi-4-multimodal-instruct
def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
    """
    Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
    show how to process image inputs.
    """
    assert modality == "image"

    model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the LoRA weights.
    vision_lora_path = os.path.join(model_path, "vision-lora")
    prompts = [
        f"<|user|><|image_1|>{question}<|end|><|assistant|>" for question in questions
    ]
    engine_args = EngineArgs(
        model=model_path,
        trust_remote_code=True,
        max_model_len=5120,
        max_num_seqs=2,
        max_num_batched_tokens=12800,
        enable_lora=True,
        max_lora_rank=320,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"dynamic_hd": 16},
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
    )
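

# Note: the LoRARequest returned by run_phi4mm() is repeated per prompt in
# main() and passed to llm.generate(..., lora_request=...), which applies the
# vision LoRA on top of the base model.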


# Pixtral HF-format
def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "mistral-community/pixtral-12b"

    # NOTE: Need L40 (or equivalent) to avoid OOM
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=6144,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Qwen-VL
def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    engine_args = EngineArgs(
        model="Qwen/Qwen-VL",
        trust_remote_code=True,
        max_model_len=1024,
        max_num_seqs=2,
        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [f"{question}Picture 1: <img></img>\n" for question in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Qwen2-VL
def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
        },
        limit_mm_per_prompt={modality: 1},
    )
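
    # Assumption for illustration: min_pixels/max_pixels bound the resized
    # image area, and each 28x28 block corresponds to one visual token, so
    # the max_pixels above caps an image at roughly 1280 visual tokens.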

    if modality == "image":
        placeholder = "<|image_pad|>"
    elif modality == "video":
        placeholder = "<|video_pad|>"

    prompts = [
        (
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
            f"{question}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Qwen2.5-VL
def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
        },
        limit_mm_per_prompt={modality: 1},
    )

    if modality == "image":
        placeholder = "<|image_pad|>"
    elif modality == "video":
        placeholder = "<|video_pad|>"

    prompts = [
        (
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
            f"{question}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Qwen2.5-Omni
def run_qwen2_5_omni(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "Qwen/Qwen2.5-Omni-7B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
        },
        limit_mm_per_prompt={modality: 1},
    )

    if modality == "image":
        placeholder = "<|IMAGE|>"
    elif modality == "video":
        placeholder = "<|VIDEO|>"

    default_system = (
        "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
        "Group, capable of perceiving auditory and visual inputs, as well as "
        "generating text and speech."
    )

    prompts = [
        (
            f"<|im_start|>system\n{default_system}<|im_end|>\n"
            f"<|im_start|>user\n<|vision_bos|>{placeholder}<|vision_eos|>"
            f"{question}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Qwen3-VL-Dense
def run_qwen3_vl(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "Qwen/Qwen3-VL-4B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
        },
        limit_mm_per_prompt={modality: 1},
    )

    if modality == "image":
        placeholder = "<|image_pad|>"
    elif modality == "video":
        placeholder = "<|video_pad|>"

    prompts = [
        (
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
            f"{question}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Qwen3-VL-MOE
def run_qwen3_vl_moe(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "Qwen/Qwen3-VL-30B-A3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        mm_processor_kwargs={
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
        },
        limit_mm_per_prompt={modality: 1},
    )

    if modality == "image":
        placeholder = "<|image_pad|>"
    elif modality == "video":
        placeholder = "<|video_pad|>"

    prompts = [
        (
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
            f"{question}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# R-4B
def run_r_vl(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "YannQi/R-4B"

    prompts = [
        f"<|im_start|>user <image>\n{question}<|im_end|><|im_start|>assistant\n"
        for question in questions
    ]

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=16384,
        limit_mm_per_prompt={modality: 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# SkyworkR1V
def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "Skywork/Skywork-R1V-38B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={modality: 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    messages = [
        [{"role": "user", "content": f"<image>\n{question}"}] for question in questions
    ]
    prompts = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Stop tokens for SkyworkR1V
    # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py
    stop_tokens = ["<|end▁of▁sentence|>", "<|endoftext|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
        stop_token_ids=stop_token_ids,
    )


# SmolVLM2-2.2B-Instruct
def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        mm_processor_kwargs={
            "max_image_size": {"longest_edge": 384},
        },
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [
        f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:"
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# Step3
def run_step3(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "stepfun-ai/step3-fp8"

    # NOTE: Below are verified configurations for step3-fp8
    # on 8xH100 GPUs.
    engine_args = EngineArgs(
        model=model_name,
        max_num_batched_tokens=4096,
        gpu_memory_utilization=0.85,
        tensor_parallel_size=8,
        limit_mm_per_prompt={modality: 1},
        reasoning_parser="step3",
    )

    prompts = [
        "<|begin▁of▁sentence|> You are a helpful assistant. <|BOT|>user\n "
        f"<im_patch>{question} <|EOT|><|BOT|>assistant\n<think>\n"
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


# omni-research/Tarsier-7b
def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"

    model_name = "omni-research/Tarsier-7b"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={modality: 1},
    )

    prompts = [f"USER: <image>\n{question} ASSISTANT:" for question in questions]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "omni-research/Tarsier2-Recap-7b"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        hf_overrides={
            "architectures": ["Tarsier2ForConditionalGeneration"],
            "model_type": "tarsier2",
        },
        limit_mm_per_prompt={modality: 1},
    )

    if modality == "image":
        placeholder = "<|image_pad|>"
    elif modality == "video":
        placeholder = "<|video_pad|>"

    prompts = [
        (
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
            f"{question}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


model_example_map = {
    "aria": run_aria,
    "aya_vision": run_aya_vision,
    "bagel": run_bagel,
    "bee": run_bee,
    "blip-2": run_blip2,
    "chameleon": run_chameleon,
    "command_a_vision": run_command_a_vision,
    "deepseek_vl_v2": run_deepseek_vl2,
    "deepseek_ocr": run_deepseek_ocr,
    "dots_ocr": run_dots_ocr,
    "eagle2_5": run_eagle2_5,
    "ernie45_vl": run_ernie45_vl,
    "fuyu": run_fuyu,
    "gemma3": run_gemma3,
    "gemma3n": run_gemma3n,
    "glm4v": run_glm4v,
    "glm4_1v": run_glm4_1v,
    "glm4_5v": run_glm4_5v,
    "glm4_5v_fp8": run_glm4_5v_fp8,
    "h2ovl_chat": run_h2ovl,
    "hunyuan_vl": run_hunyuan_vl,
    "hyperclovax_seed_vision": run_hyperclovax_seed_vision,
    "idefics3": run_idefics3,
    "interns1": run_interns1,
    "internvl_chat": run_internvl,
    "kanana_v": run_kanana_v,
    "keye_vl": run_keye_vl,
    "keye_vl1_5": run_keye_vl1_5,
    "kimi_vl": run_kimi_vl,
    "lightonocr": run_lightonocr,
    "lfm2_vl": run_lfm2_vl,
    "llama4": run_llama4,
    "llava": run_llava,
    "llava-next": run_llava_next,
    "llava-next-video": run_llava_next_video,
    "llava-onevision": run_llava_onevision,
    "mantis": run_mantis,
    "minicpmo": run_minicpmo,
    "minicpmv": run_minicpmv,
    "minimax_vl_01": run_minimax_vl_01,
    "mistral3": run_mistral3,
    "molmo": run_molmo,
    "molmo2": run_molmo2,
    "nemotron_vl": run_nemotron_vl,
    "NVLM_D": run_nvlm_d,
    "ovis": run_ovis,
    "ovis2_5": run_ovis2_5,
    "paddleocr_vl": run_paddleocr_vl,
    "paligemma": run_paligemma,
    "paligemma2": run_paligemma2,
    "phi3_v": run_phi3v,
    "phi4_mm": run_phi4mm,
    "pixtral_hf": run_pixtral_hf,
    "qwen_vl": run_qwen_vl,
    "qwen2_vl": run_qwen2_vl,
    "qwen2_5_vl": run_qwen2_5_vl,
    "qwen2_5_omni": run_qwen2_5_omni,
    "qwen3_vl": run_qwen3_vl,
    "qwen3_vl_moe": run_qwen3_vl_moe,
    "rvl": run_r_vl,
    "skywork_chat": run_skyworkr1v,
    "smolvlm": run_smolvlm,
    "step3": run_step3,
    "tarsier": run_tarsier,
    "tarsier2": run_tarsier2,
}
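
# The keys above are the values accepted by the --model-type flag defined in
# parse_args() below.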


MODELS_NEED_VIDEO_METADATA = [
    "glm4_1v",
    "glm4_5v",
    "glm4_5v_fp8",
    "molmo2",
    "qwen3_vl",
    "qwen3_vl_moe",
]
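
# For the models listed above, get_multi_modal_input() passes each video as a
# (frames, metadata) tuple instead of a bare frame array, since their
# processors also consume the video metadata.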


def get_multi_modal_input(args):
    """
    Return a dict of the form:

        {
            "data": image or video,
            "questions": questions,
        }
    """
    if args.modality == "image":
        # Input image and question
        image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
        img_questions = [
            "What is the content of this image?",
            "Describe the content of this image in detail.",
            "What's in the image?",
            "Where is this image taken?",
        ]

        return {
            "data": image,
            "questions": img_questions,
        }

    if args.modality == "video":
        # Input video and question
        needs_metadata = args.model_type in MODELS_NEED_VIDEO_METADATA
        video = VideoAsset(name="baby_reading", num_frames=args.num_frames).np_ndarrays
        metadata = VideoAsset(name="baby_reading", num_frames=args.num_frames).metadata
        vid_questions = ["Why is this video funny?"]

        return {
            "data": ([(video, metadata)] if needs_metadata else video),
            "questions": vid_questions,
        }

    msg = f"Modality {args.modality} is not supported."
    raise ValueError(msg)


def apply_image_repeat(
    image_repeat_prob, num_prompts, data, prompts: list[str], modality
):
    """Repeats images with the provided probability "image_repeat_prob".

    Used to simulate hit/miss for the MM preprocessor cache.
    """
    assert 0 <= image_repeat_prob <= 1.0
    no_yes = [0, 1]
    probs = [1.0 - image_repeat_prob, image_repeat_prob]

    inputs = []
    inputs_with_empty_media = []
    cur_image = data
    for i in range(num_prompts):
        if image_repeat_prob is not None:
            res = random.choices(no_yes, probs)[0]
            if res == 0:
                # No repeat => Modify one pixel
                cur_image = cur_image.copy()
                new_val = (i // 256 // 256, i // 256, i % 256)
                cur_image.putpixel((0, 0), new_val)
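                # A single changed pixel makes the image content unique, so
                # the preprocessor cache treats it as a miss; unmodified
                # repeats keep identical content and can hit the cache.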

        uuid = f"uuid_{i}"

        inputs.append(
            {
                "prompt": prompts[i % len(prompts)],
                "multi_modal_data": {modality: cur_image},
                "multi_modal_uuids": {modality: uuid},
            }
        )

        inputs_with_empty_media.append(
            {
                "prompt": prompts[i % len(prompts)],
                "multi_modal_data": {modality: None},
                "multi_modal_uuids": {modality: uuid},
            }
        )

    return inputs, inputs_with_empty_media


@contextmanager
def time_counter(enable: bool):
    if enable:
        import time

        start_time = time.time()
        yield
        elapsed_time = time.time() - start_time
        print("-" * 50)
        print(f"-- generate time = {elapsed_time}")
        print("-" * 50)
    else:
        yield
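

# Example usage: `with time_counter(True): ...` prints the wall-clock time of
# the wrapped block; with False it is a no-op.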


def parse_args():
    parser = FlexibleArgumentParser(
        description="Demo on using vLLM for offline inference with "
        "vision language models for text generation"
    )
    parser.add_argument(
        "--model-type",
        "-m",
        type=str,
        default="llava",
        choices=model_example_map.keys(),
        help='Huggingface "model_type".',
    )
    parser.add_argument(
        "--num-prompts", type=int, default=4, help="Number of prompts to run."
    )
    parser.add_argument(
        "--modality",
        type=str,
        default="image",
        choices=["image", "video"],
        help="Modality of the input.",
    )
    parser.add_argument(
        "--num-frames",
        type=int,
        default=16,
        help="Number of frames to extract from the video.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=0,
        help="Set the seed when initializing `vllm.LLM`.",
    )

    parser.add_argument(
        "--image-repeat-prob",
        type=float,
        default=None,
        help="Simulates the hit ratio of the multi-modal preprocessor cache "
        "(if enabled).",
    )

    parser.add_argument(
        "--disable-mm-processor-cache",
        action="store_true",
        help="If set, disables the multi-modal processor cache.",
    )

    parser.add_argument(
        "--time-generate",
        action="store_true",
        help="If set, print the total generate() call time.",
    )

    parser.add_argument(
        "--use-different-prompt-per-request",
        action="store_true",
        help="If set, use a different prompt (with the same multi-modal "
        "data) for each request.",
    )

    parser.add_argument(
        "--verify-mm-cache-hit-with-uuids",
        action="store_true",
        help="If set, send all requests in a second batch with empty "
        "multi-modal data to verify cache hits with UUIDs.",
    )
    parser.add_argument(
        "--tensor-parallel-size",
        "-tp",
        type=int,
        default=None,
        help="Tensor parallel size to override the model's default setting.",
    )
    return parser.parse_args()


def main(args):
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    if args.tensor_parallel_size is not None and args.tensor_parallel_size < 1:
        raise ValueError(
            "tensor_parallel_size must be a positive integer, "
            f"got {args.tensor_parallel_size}"
        )

    modality = args.modality
    mm_input = get_multi_modal_input(args)
    data = mm_input["data"]
    questions = mm_input["questions"]

    req_data = model_example_map[model](questions, modality)

    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
        req_data.engine_args.limit_mm_per_prompt or {}
    )
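    # e.g. with modality="image" and a model default of {"image": 1}, the
    # merged limit becomes {"image": 1, "video": 0, "audio": 0}.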

    engine_args = asdict(req_data.engine_args) | {
        "seed": args.seed,
        "mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4,
    }
    if args.tensor_parallel_size is not None:
        engine_args["tensor_parallel_size"] = args.tensor_parallel_size
    llm = LLM(**engine_args)

    # Don't want to check the flag multiple times, so just hijack `prompts`.
    prompts = (
        req_data.prompts
        if args.use_different_prompt_per_request
        else [req_data.prompts[0]]
    )

    # We set temperature to 0.2 so that outputs can differ even when all
    # prompts are identical during batch inference.
    sampling_params = (
        SamplingParams(
            temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids
        )
        if req_data.sampling_params is None
        else req_data.sampling_params
    )

    assert args.num_prompts > 0
    if args.num_prompts == 1:
        # Single inference
        uuid = "uuid_0"
        inputs = {
            "prompt": prompts[0],
            "multi_modal_data": {modality: data},
            "multi_modal_uuids": {modality: uuid},
        }
        inputs_with_empty_media = {
            "prompt": prompts[0],
            "multi_modal_data": {modality: None},
            "multi_modal_uuids": {modality: uuid},
        }
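        # The shared UUID lets the engine match the empty-media request below
        # to the processed media cached for the first request.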
    else:
        # Batch inference
        if args.image_repeat_prob is not None:
            # Repeat images with the specified probability "image_repeat_prob"
            inputs, inputs_with_empty_media = apply_image_repeat(
                args.image_repeat_prob,
                args.num_prompts,
                data,
                prompts,
                modality,
            )
        else:
            # Use the same image for all prompts
            inputs = []
            inputs_with_empty_media = []
            for i in range(args.num_prompts):
                uuid = f"uuid_{i}"
                inputs.append(
                    {
                        "prompt": prompts[i % len(prompts)],
                        "multi_modal_data": {modality: data},
                        "multi_modal_uuids": {modality: uuid},
                    }
                )
                inputs_with_empty_media.append(
                    {
                        "prompt": prompts[i % len(prompts)],
                        "multi_modal_data": {modality: None},
                        "multi_modal_uuids": {modality: uuid},
                    }
                )

    # Add LoRA request if applicable
    lora_request = (
        req_data.lora_requests * args.num_prompts if req_data.lora_requests else None
    )

    with time_counter(args.time_generate):
        outputs = llm.generate(
            inputs,
            sampling_params=sampling_params,
            lora_request=lora_request,
        )

    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
    print("-" * 50)

    if args.verify_mm_cache_hit_with_uuids:
        try:
            # Verify cache hits with UUIDs
            print(
                "Sending a second batch of requests with empty media"
                " and matching UUIDs."
            )
            outputs = llm.generate(
                inputs_with_empty_media,
                sampling_params=sampling_params,
                lora_request=lora_request,
            )
            print("-" * 50)
            for o in outputs:
                generated_text = o.outputs[0].text
                print(generated_text)
            print("-" * 50)
        except Exception as e:
            print(f"Failed to verify cache hits with UUIDs. Error: {e}")


if __name__ == "__main__":
    args = parse_args()
    main(args)
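
# Example invocation (script name assumed for illustration):
#   python vision_language.py -m qwen2_5_vl --modality image --num-prompts 4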