# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference with
multi-image input on vision language models for text generation,
using the chat template defined by the model.
"""
import os
from argparse import Namespace
from dataclasses import asdict
from typing import NamedTuple

from huggingface_hub import snapshot_download
from PIL.Image import Image
from transformers import AutoProcessor, AutoTokenizer

from vllm import LLM, EngineArgs, SamplingParams
from vllm.lora.request import LoRARequest
from vllm.multimodal.utils import fetch_image
from vllm.utils.argparse_utils import FlexibleArgumentParser
# Question asked about every image in the batch.
QUESTION = "What is the content of each image?"

# Publicly hosted sample images; every loader below fetches all of them.
IMAGE_URLS = [
    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg",
    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/lion.jpg",
    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/flycatcher.jpeg",
    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/somefish.jpg",
    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/starfish.jpg",
    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/snail.jpg",
    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/thistle.jpg",
    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/husky.jpg",
    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/orangetabbycat.jpg",
    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/guineapig.jpg",
    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/rabbit.jpg",
    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/horsepony.jpg",
]
class ModelRequestData(NamedTuple):
    """Everything needed to run one multi-image request against one model."""

    # Engine/launch configuration for the target model.
    engine_args: EngineArgs
    # Fully formatted prompt containing the image placeholders.
    prompt: str
    # Images fetched from IMAGE_URLS, in prompt order.
    image_data: list[Image]
    # Optional extra token IDs at which generation should stop.
    stop_token_ids: list[int] | None = None
    # Optional custom chat template overriding the model's default.
    chat_template: str | None = None
    # Optional LoRA adapters to apply (e.g. Phi-4-multimodal vision LoRA).
    lora_requests: list[LoRARequest] | None = None
    # Optional model-specific sampling parameters.
    sampling_params: SamplingParams | None = None
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.
def load_aria(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for rhymes-ai/Aria."""
    model_name = "rhymes-ai/Aria"
    engine_args = EngineArgs(
        model=model_name,
        tokenizer_mode="slow",
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={"image": len(image_urls)},
    )
    # One image placeholder per URL, in order.
    placeholders = "<fim_prefix><|img|><fim_suffix>\n" * len(image_urls)
    prompt = (
        f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n<|im_start|>assistant\n"
    )
    stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for CohereLabs/aya-vision-8b."""
    model_name = "CohereLabs/aya-vision-8b"

    engine_args = EngineArgs(
        model=model_name,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    # Render the prompt with the model's own chat template.
    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_bee(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for Open-Bee/Bee-8B-RL."""
    model_name = "Open-Bee/Bee-8B-RL"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=16384,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
        trust_remote_code=True,
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_command_a_vision(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for CohereLabs/command-a-vision-07-2025."""
    model_name = "CohereLabs/command-a-vision-07-2025"

    # NOTE: This model is 122B parameters and requires tensor parallelism
    # Recommended to use tp=4 on H100 GPUs
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=32768,
        tensor_parallel_size=4,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_deepseek_vl2(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for deepseek-ai/deepseek-vl2-tiny."""
    model_name = "deepseek-ai/deepseek-vl2-tiny"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        # Map the checkpoint onto vLLM's implementation class.
        hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Numbered <image> placeholders: "image_1:<image>\n", "image_2:<image>\n", ...
    placeholder = "".join(
        f"image_{i}:<image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    prompt = f"<|User|>: {placeholder}{question}\n\n<|Assistant|>:"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_deepseek_ocr(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image OCR request for deepseek-ai/DeepSeek-OCR."""
    from vllm.model_executor.models.deepseek_ocr import NGramPerReqLogitsProcessor

    model_name = "deepseek-ai/DeepSeek-OCR"

    engine_args = EngineArgs(
        model=model_name,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        logits_processors=[NGramPerReqLogitsProcessor],
    )

    placeholder = "<image>\n" * len(image_urls)
    prompt = placeholder + question

    # The following sampling params config is taken from
    # the official Deepseek-OCR inference example.
    # (IMPORTANT) Use the custom logits processor and avoid skipping
    # special tokens for this model for the optimal OCR performance.
    sampling_params = SamplingParams(
        temperature=0.0,
        max_tokens=8192,
        # ngram logit processor args
        extra_args=dict(
            ngram_size=30,
            window_size=90,
            # whitelist: <td>, </td>
            whitelist_token_ids={128821, 128822},
        ),
        skip_special_tokens=False,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
        sampling_params=sampling_params,
    )
def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for google/gemma-3-4b-it."""
    model_name = "google/gemma-3-4b-it"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for h2oai/h2ovl-mississippi-800m."""
    model_name = "h2oai/h2ovl-mississippi-800m"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Stop tokens for H2OVL-Mississippi
    # https://huggingface.co/h2oai/h2ovl-mississippi-800m
    stop_token_ids = [tokenizer.eos_token_id]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
    )
# HunyuanOCR
def load_hunyuan_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for tencent/HunyuanOCR."""
    model_name = "tencent/HunyuanOCR"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # One image-placeholder triple (start/content/end tokens) per image.
    placeholder = (
        "<|hy_place▁holder▁no▁100|><|hy_place▁holder▁no▁102|><|hy_place▁holder▁no▁101|>"  # noqa: E501
    ) * len(image_urls)
    prompt = f"<|hy_begin▁of▁sentence|>{placeholder}{question}<|hy_User|>"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_hyperclovax_seed_vision(
    question: str, image_urls: list[str]
) -> ModelRequestData:
    """Build a multi-image request for HyperCLOVAX-SEED-Vision-Instruct-3B.

    Each image entry carries extra OCR/lens-keyword fields expected by this
    model's chat template (left empty here).
    """
    model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=16384,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    message = {"role": "user", "content": []}
    for _image_url in image_urls:
        message["content"].append(
            {
                "type": "image",
                "image": _image_url,
                "ocr": "",
                "lens_keywords": "",
                "lens_local_keywords": "",
            }
        )
    message["content"].append(
        {
            "type": "text",
            "text": question,
        }
    )

    prompt = tokenizer.apply_chat_template(
        [
            message,
        ],
        tokenize=False,
        add_generation_prompt=True,
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=None,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for HuggingFaceM4/Idefics3-8B-Llama3."""
    model_name = "HuggingFaceM4/Idefics3-8B-Llama3"

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=16,
        enforce_eager=True,
        limit_mm_per_prompt={"image": len(image_urls)},
        # if you are running out of memory, you can reduce the "longest_edge".
        # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
        mm_processor_kwargs={
            "size": {"longest_edge": 2 * 364},
        },
    )

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_interns1(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for internlm/Intern-S1-mini."""
    model_name = "internlm/Intern-S1-mini"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = "\n".join(
        f"Image-{i}: <IMG_CONTEXT>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for OpenGVLab/InternVL2-2B."""
    model_name = "OpenGVLab/InternVL2-2B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Stop tokens for InternVL
    # models variants may have different stop tokens
    # please refer to the model card for the correct "stop words":
    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for Kwai-Keye/Keye-VL-8B-Preview."""
    model_name = "Kwai-Keye/Keye-VL-8B-Preview"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        },
    ]

    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    image_data = [fetch_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )
def load_keye_vl1_5(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for Kwai-Keye/Keye-VL-1_5-8B."""
    model_name = "Kwai-Keye/Keye-VL-1_5-8B"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=32768,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        },
    ]

    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    image_data = [fetch_image(url) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )
def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for moonshotai/Kimi-VL-A3B-Instruct."""
    model_name = "moonshotai/Kimi-VL-A3B-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=4,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for meta-llama/Llama-4-Scout-17B-16E-Instruct."""
    model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=131072,
        tensor_parallel_size=8,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for llava-hf/llava-1.5-7b-hf."""
    # NOTE: CAUTION! Original Llava models wasn't really trained on multi-image inputs,
    # it will generate poor response for multi-image inputs!
    model_name = "llava-hf/llava-1.5-7b-hf"

    engine_args = EngineArgs(
        model=model_name,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for llava-hf/llava-v1.6-mistral-7b-hf."""
    model_name = "llava-hf/llava-v1.6-mistral-7b-hf"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for llava-hf/llava-onevision-qwen2-7b-ov-hf."""
    model_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=16384,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {
            "role": "user",
            "content": [
                *placeholders,
                {"type": "text", "text": question},
            ],
        }
    ]

    processor = AutoProcessor.from_pretrained(model_name)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for Mistral-Small-3.1-24B-Instruct-2503."""
    model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        # Skip the duplicate single-file checkpoint to save download time/space.
        ignore_patterns=["consolidated.safetensors"],
    )

    placeholders = "[IMG]" * len(image_urls)
    prompt = f"<s>[INST]{question}\n{placeholders}[/INST]"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for nvidia/NVLM-D-72B."""
    model_name = "nvidia/NVLM-D-72B"

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        tensor_parallel_size=4,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"max_dynamic_patch": 4},
    )

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
# Ovis
def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for AIDC-AI/Ovis2-1B."""
    model_name = "AIDC-AI/Ovis2-1B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="half",
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
# ovis2_5
def load_ovis2_5(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for AIDC-AI/Ovis2.5-2B."""
    model_name = "AIDC-AI/Ovis2.5-2B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="half",
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    prompt = (
        f"<|im_start|>user\n\n{placeholders}\n{question}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_paddleocr_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for PaddlePaddle/PaddleOCR-VL."""
    model_name = "PaddlePaddle/PaddleOCR-VL"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>" * len(image_urls)
    prompt = f"<|begin_of_sentence|>User: {question}{placeholders}\nAssistant: "

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for mistral-community/pixtral-12b."""
    model_name = "mistral-community/pixtral-12b"

    # Adjust this as necessary to fit in GPU
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        tensor_parallel_size=2,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = "[IMG]" * len(image_urls)
    prompt = f"<s>[INST]{question}\n{placeholders}[/INST]"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for microsoft/Phi-3.5-vision-instruct."""
    # num_crops is an override kwarg to the multimodal image processor;
    # For some models, e.g., Phi-3.5-vision-instruct, it is recommended
    # to use 16 for single frame scenarios, and 4 for multi-frame.
    #
    # Generally speaking, a larger value for num_crops results in more
    # tokens per image instance, because it may scale the image more in
    # the image preprocessing. Some references in the model docs and the
    # formula for image tokens after the preprocessing
    # transform can be found below.
    #
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
    # https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
    engine_args = EngineArgs(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={"num_crops": 4},
    )

    # Numbered placeholders: <|image_1|>, <|image_2|>, ...
    placeholders = "\n".join(
        f"<|image_{i}|>" for i, _ in enumerate(image_urls, start=1)
    )
    prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
    """
    Phi-4-multimodal-instruct supports both image and audio inputs. Here, we
    show how to process multi images inputs.
    """
    model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
    # Since the vision-lora and speech-lora co-exist with the base model,
    # we have to manually specify the path of the lora weights.
    vision_lora_path = os.path.join(model_path, "vision-lora")

    engine_args = EngineArgs(
        model=model_path,
        trust_remote_code=True,
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        enable_lora=True,
        max_lora_rank=320,
        # Note - mm_processor_kwargs can also be passed to generate/chat calls
        mm_processor_kwargs={"dynamic_hd": 4},
    )

    # Image placeholders are 1-indexed and concatenated without separators.
    image_tags = [f"<|image_{idx}|>" for idx in range(1, len(image_urls) + 1)]
    prompt = f"<|user|>{''.join(image_tags)}{question}<|end|><|assistant|>"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
        lora_requests=[LoRARequest("vision", 1, vision_lora_path)],
    )
2025-05-26 17:57:54 +01:00
def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for Qwen/Qwen-VL-Chat."""
    model_name = "Qwen/Qwen-VL-Chat"
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=1024,
        max_num_seqs=2,
        hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Qwen-VL-Chat expects "Picture k: <img></img>" markers, 1-indexed.
    picture_tags = [
        f"Picture {idx}: <img></img>\n" for idx in range(1, len(image_urls) + 1)
    ]
    placeholders = "".join(picture_tags)

    # This model does not have a chat_template attribute on its tokenizer,
    # so we need to explicitly pass it. We use ChatML since it's used in the
    # generation utils of the model:
    # https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating
    chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{'<|im_start|>assistant\n'}}{% endif %}"  # noqa: E501

    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        chat_template=chat_template,
    )

    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(tok) for tok in stop_tokens]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        stop_token_ids=stop_token_ids,
        image_data=[fetch_image(url) for url in image_urls],
        chat_template=chat_template,
    )
2025-03-17 18:00:17 +08:00
def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for Qwen/Qwen2-VL-7B-Instruct."""
    try:
        from qwen_vl_utils import smart_resize
    except ModuleNotFoundError:
        print(
            "WARNING: `qwen-vl-utils` not installed, input images will not "
            "be automatically resized. You can enable this functionality by "
            "`pip install qwen-vl-utils`."
        )
        smart_resize = None

    model_name = "Qwen/Qwen2-VL-7B-Instruct"

    # Tested on L40
    engine_args = EngineArgs(
        model=model_name,
        # Without smart_resize the per-image token count is unbounded, so a
        # much longer context is reserved.
        max_model_len=32768 if smart_resize is None else 4096,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    image_entries = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                *image_entries,
                {"type": "text", "text": question},
            ],
        },
    ]

    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    def _maybe_resize(image: Image) -> Image:
        # Pass through unchanged when qwen-vl-utils is unavailable.
        if smart_resize is None:
            return image
        width, height = image.size
        resized_height, resized_width = smart_resize(
            height, width, max_pixels=1024 * 28 * 28
        )
        return image.resize((resized_width, resized_height))

    image_data = [_maybe_resize(fetch_image(url)) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )
2024-09-07 16:38:23 +08:00
2025-03-17 18:00:17 +08:00
def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for Qwen/Qwen2.5-VL-3B-Instruct."""
    try:
        from qwen_vl_utils import smart_resize
    except ModuleNotFoundError:
        print(
            "WARNING: `qwen-vl-utils` not installed, input images will not "
            "be automatically resized. You can enable this functionality by "
            "`pip install qwen-vl-utils`."
        )
        smart_resize = None

    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
    engine_args = EngineArgs(
        model=model_name,
        # Without smart_resize the per-image token count is unbounded, so a
        # much longer context is reserved.
        max_model_len=32768 if smart_resize is None else 4096,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    image_entries = [{"type": "image", "image": url} for url in image_urls]
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                *image_entries,
                {"type": "text", "text": question},
            ],
        },
    ]

    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    def _maybe_resize(image: Image) -> Image:
        # Pass through unchanged when qwen-vl-utils is unavailable.
        if smart_resize is None:
            return image
        width, height = image.size
        resized_height, resized_width = smart_resize(
            height, width, max_pixels=1024 * 28 * 28
        )
        return image.resize((resized_width, resized_height))

    image_data = [_maybe_resize(fetch_image(url)) for url in image_urls]

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=image_data,
    )
2025-08-21 12:08:52 +08:00
def load_r_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for YannQi/R-4B."""
    model_name = "YannQi/R-4B"
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=16384,
        max_num_seqs=16,
        trust_remote_code=True,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # One image entry per URL, followed by the text question.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    messages = [{"role": "user", "content": user_content}]

    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
2025-08-01 01:35:49 -07:00
def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for HuggingFaceTB/SmolVLM2-2.2B-Instruct."""
    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model="HuggingFaceTB/SmolVLM2-2.2B-Instruct",
        max_model_len=8192,
        max_num_seqs=16,
        enforce_eager=True,
        limit_mm_per_prompt={"image": len(image_urls)},
        mm_processor_kwargs={
            "max_image_size": {"longest_edge": 384},
        },
    )

    # Each image gets a labeled "<image>" tag, 1-indexed.
    image_tags = [f"Image-{idx}: <image>\n" for idx in range(1, len(image_urls) + 1)]
    placeholders = "\n".join(image_tags)
    prompt = (
        f"<|im_start|>User: {placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
def load_step3(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for stepfun-ai/step3-fp8."""
    # NOTE: Below are verified configurations for step3-fp8
    # on 8xH100 GPUs.
    engine_args = EngineArgs(
        model="stepfun-ai/step3-fp8",
        max_num_batched_tokens=4096,
        gpu_memory_utilization=0.85,
        tensor_parallel_size=8,
        limit_mm_per_prompt={"image": len(image_urls)},
        reasoning_parser="step3",
    )

    # One "<im_patch>" token per image, prepended to the question.
    image_tokens = "<im_patch>" * len(image_urls)
    prompt = (
        "<|begin▁of▁sentence|>You are a helpful assistant.<|BOT|>user\n"
        f"{image_tokens}{question}<|EOT|><|BOT|"
        ">assistant\n<think>\n"
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
2025-06-03 13:13:13 +08:00
def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for omni-research/Tarsier-7b."""
    engine_args = EngineArgs(
        model="omni-research/Tarsier-7b",
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # One "<image>" token per image in a plain USER/ASSISTANT prompt.
    image_tokens = "<image>" * len(image_urls)
    prompt = f"USER: {image_tokens}\n{question}\nASSISTANT:"

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
2025-06-21 12:01:51 +08:00
def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for omni-research/Tarsier2-Recap-7b."""
    engine_args = EngineArgs(
        model="omni-research/Tarsier2-Recap-7b",
        trust_remote_code=True,
        max_model_len=32768,
        limit_mm_per_prompt={"image": len(image_urls)},
        # The HF config does not declare the vLLM architecture/model_type,
        # so both are overridden here.
        hf_overrides={
            "architectures": ["Tarsier2ForConditionalGeneration"],
            "model_type": "tarsier2",
        },
    )

    # ChatML-style prompt with one "<|image_pad|>" per image inside the
    # vision span.
    pads = "<|image_pad|>" * len(image_urls)
    prompt = (
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
        f"<|im_start|>user\n<|vision_start|>{pads}"
        f"<|vision_end|>{question}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
2025-08-19 16:56:31 +09:00
# GLM-4.5V
def load_glm4_5v(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for zai-org/GLM-4.5V."""
    model_name = "zai-org/GLM-4.5V"
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=32768,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    # One image entry per URL, followed by the text question.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    messages = [{"role": "user", "content": user_content}]

    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
# GLM-4.5V-FP8
def load_glm4_5v_fp8(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for zai-org/GLM-4.5V-FP8."""
    model_name = "zai-org/GLM-4.5V-FP8"
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=32768,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    # One image entry per URL, followed by the text question.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    messages = [{"role": "user", "content": user_content}]

    processor = AutoProcessor.from_pretrained(model_name)
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
2024-09-07 16:38:23 +08:00
# Maps each supported `--model-type` CLI value to the loader that builds its
# ModelRequestData (engine args, prompt, and fetched images). The key order
# is what argparse displays as valid choices; keys are mostly alphabetical,
# with the glm4_5v entries appended at the end.
model_example_map = {
    "aria": load_aria,
    "aya_vision": load_aya_vision,
    "bee": load_bee,
    "command_a_vision": load_command_a_vision,
    "deepseek_vl_v2": load_deepseek_vl2,
    "deepseek_ocr": load_deepseek_ocr,
    "gemma3": load_gemma3,
    "h2ovl_chat": load_h2ovl,
    "hunyuan_vl": load_hunyuan_vl,
    "hyperclovax_seed_vision": load_hyperclovax_seed_vision,
    "idefics3": load_idefics3,
    "interns1": load_interns1,
    "internvl_chat": load_internvl,
    "keye_vl": load_keye_vl,
    "keye_vl1_5": load_keye_vl1_5,
    "kimi_vl": load_kimi_vl,
    "llama4": load_llama4,
    "llava": load_llava,
    "llava-next": load_llava_next,
    "llava-onevision": load_llava_onevision,
    "mistral3": load_mistral3,
    "NVLM_D": load_nvlm_d,
    "ovis": load_ovis,
    "ovis2_5": load_ovis2_5,
    "paddleocr_vl": load_paddleocr_vl,
    "phi3_v": load_phi3v,
    "phi4_mm": load_phi4mm,
    "pixtral_hf": load_pixtral_hf,
    "qwen_vl_chat": load_qwen_vl_chat,
    "qwen2_vl": load_qwen2_vl,
    "qwen2_5_vl": load_qwen2_5_vl,
    "rvl": load_r_vl,
    "smolvlm": load_smolvlm,
    "step3": load_step3,
    "tarsier": load_tarsier,
    "tarsier2": load_tarsier2,
    "glm4_5v": load_glm4_5v,
    "glm4_5v_fp8": load_glm4_5v_fp8,
}
2025-11-25 14:03:20 +08:00
def run_generate(
    model: str,
    question: str,
    image_urls: list[str],
    seed: int,
    tensor_parallel_size: int | None,
):
    """Run the selected model example through `LLM.generate`.

    Args:
        model: Key into ``model_example_map`` selecting the loader.
        question: Text question asked about the images.
        image_urls: URLs of the images to include in the prompt.
        seed: Seed forwarded to the engine for reproducibility.
        tensor_parallel_size: Optional override of the example's default
            tensor-parallel size.
    """
    req_data = model_example_map[model](question, image_urls)

    # Disable other modalities to save memory (mirrors run_chat).
    default_limits = {"image": 0, "video": 0, "audio": 0}
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
        req_data.engine_args.limit_mm_per_prompt or {}
    )

    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    if tensor_parallel_size is not None:
        engine_args["tensor_parallel_size"] = tensor_parallel_size
    llm = LLM(**engine_args)

    # Honor a loader-supplied SamplingParams override, as run_chat does;
    # otherwise fall back to deterministic greedy decoding.
    sampling_params = (
        SamplingParams(
            temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
        )
        if req_data.sampling_params is None
        else req_data.sampling_params
    )

    outputs = llm.generate(
        {
            "prompt": req_data.prompt,
            "multi_modal_data": {"image": req_data.image_data},
        },
        sampling_params=sampling_params,
        lora_request=req_data.lora_requests,
    )

    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        print("-" * 50)
2024-09-05 18:51:53 +08:00
2025-11-25 14:03:20 +08:00
def run_chat(
    model: str,
    question: str,
    image_urls: list[str],
    seed: int,
    tensor_parallel_size: int | None,
):
    """Run the selected model example through `LLM.chat`.

    Args:
        model: Key into ``model_example_map`` selecting the loader.
        question: Text question asked about the images.
        image_urls: URLs of the images to include in the chat message.
        seed: Seed forwarded to the engine for reproducibility.
        tensor_parallel_size: Optional override of the example's default
            tensor-parallel size.
    """
    req_data = model_example_map[model](question, image_urls)

    # Disable other modalities to save memory
    default_limits = {"image": 0, "video": 0, "audio": 0}
    req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
        req_data.engine_args.limit_mm_per_prompt or {}
    )

    engine_args = asdict(req_data.engine_args) | {"seed": seed}
    if tensor_parallel_size is not None:
        engine_args["tensor_parallel_size"] = tensor_parallel_size
    llm = LLM(**engine_args)

    # Fall back to deterministic greedy decoding unless the loader supplied
    # its own SamplingParams.
    if req_data.sampling_params is None:
        sampling_params = SamplingParams(
            temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
        )
    else:
        sampling_params = req_data.sampling_params

    # A single user turn: the question first, then one image_url part per URL.
    user_content = [{"type": "text", "text": question}]
    user_content += [
        {"type": "image_url", "image_url": {"url": image_url}}
        for image_url in image_urls
    ]

    outputs = llm.chat(
        [{"role": "user", "content": user_content}],
        sampling_params=sampling_params,
        chat_template=req_data.chat_template,
        lora_request=req_data.lora_requests,
    )

    print("-" * 50)
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
        print("-" * 50)
2024-09-05 18:51:53 +08:00
2025-04-15 16:05:30 +08:00
def parse_args():
    """Define and parse the CLI arguments for this demo."""
    parser = FlexibleArgumentParser(
        description="Demo on using vLLM for offline inference with "
        "vision language models that support multi-image input for text "
        "generation"
    )
    parser.add_argument(
        "--model-type",
        "-m",
        type=str,
        default="phi3_v",
        choices=model_example_map.keys(),
        help='Huggingface "model_type".',
    )
    parser.add_argument(
        "--method",
        type=str,
        default="generate",
        choices=["generate", "chat"],
        help="The method to run in `vllm.LLM`.",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=0,
        help="Set the seed when initializing `vllm.LLM`.",
    )
    # The demo ships a fixed pool of image URLs; cap the count accordingly.
    max_images = len(IMAGE_URLS)
    parser.add_argument(
        "--num-images",
        "-n",
        type=int,
        choices=list(range(1, max_images + 1)),  # the max number of images
        default=2,
        help="Number of images to use for the demo.",
    )
    parser.add_argument(
        "--tensor-parallel-size",
        "-tp",
        type=int,
        default=None,
        help="Tensor parallel size to override the model's default setting.",
    )
    return parser.parse_args()
2024-09-05 18:51:53 +08:00
2025-04-15 16:05:30 +08:00
def main(args: Namespace):
    """Validate the parsed arguments and dispatch to generate or chat."""
    model = args.model_type
    method = args.method
    seed = args.seed

    tensor_parallel_size = args.tensor_parallel_size
    if tensor_parallel_size is not None and tensor_parallel_size < 1:
        raise ValueError(
            f"tensor_parallel_size must be a positive integer, "
            f"got {tensor_parallel_size}"
        )

    image_urls = IMAGE_URLS[: args.num_images]

    # Dispatch table instead of an if/elif chain.
    runners = {"generate": run_generate, "chat": run_chat}
    if method not in runners:
        raise ValueError(f"Invalid method: {method}")
    runners[method](model, QUESTION, image_urls, seed, tensor_parallel_size)
if __name__ == "__main__":
    # Script entry point: parse CLI flags and run the demo.
    main(parse_args())